1 /* 2 Defines the basic matrix operations for the AIJ (compressed row) 3 matrix storage format using the CUSPARSE library, 4 */ 5 #define PETSC_SKIP_SPINLOCK 6 #define PETSC_SKIP_CXX_COMPLEX_FIX 7 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 8 9 #include <petscconf.h> 10 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 11 #include <../src/mat/impls/sbaij/seq/sbaij.h> 12 #include <../src/vec/vec/impls/dvecimpl.h> 13 #include <petsc/private/vecimpl.h> 14 #undef VecType 15 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 16 #include <thrust/async/for_each.h> 17 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 18 #include <cooperative_groups.h> 19 #endif 20 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 21 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 22 /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in 23 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them. 
24 25 typedef enum { 26 CUSPARSE_MV_ALG_DEFAULT = 0, 27 CUSPARSE_COOMV_ALG = 1, 28 CUSPARSE_CSRMV_ALG1 = 2, 29 CUSPARSE_CSRMV_ALG2 = 3 30 } cusparseSpMVAlg_t; 31 32 typedef enum { 33 CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0, 34 CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1, 35 CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2, 36 CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3, 37 CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4, 38 CUSPARSE_SPMM_ALG_DEFAULT = 0, 39 CUSPARSE_SPMM_COO_ALG1 = 1, 40 CUSPARSE_SPMM_COO_ALG2 = 2, 41 CUSPARSE_SPMM_COO_ALG3 = 3, 42 CUSPARSE_SPMM_COO_ALG4 = 5, 43 CUSPARSE_SPMM_CSR_ALG1 = 4, 44 CUSPARSE_SPMM_CSR_ALG2 = 6, 45 } cusparseSpMMAlg_t; 46 47 typedef enum { 48 CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc 49 CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministc 50 } cusparseCsr2CscAlg_t; 51 */ 52 const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0}; 53 const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0}; 54 const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! 
We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0}; 55 #endif 56 57 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 58 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 59 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 60 61 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSEBAND(Mat,Mat,IS,IS,const MatFactorInfo*); 62 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSEBAND(Mat,Mat,const MatFactorInfo*); 63 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 64 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 65 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 66 67 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 68 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 69 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 70 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 71 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat); 72 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure); 73 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar); 74 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 75 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 76 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 77 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 78 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 79 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 80 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool); 81 82 static PetscErrorCode 
CsrMatrix_Destroy(CsrMatrix**); 83 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**); 84 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat); 85 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**); 86 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**); 87 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**); 88 89 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat); 90 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat); 91 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool); 92 93 PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]); 94 PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode); 95 96 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]); 97 98 PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream) 99 { 100 cusparseStatus_t stat; 101 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 102 103 PetscFunctionBegin; 104 if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 105 cusparsestruct->stream = stream; 106 stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat); 107 PetscFunctionReturn(0); 108 } 109 110 PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle) 111 { 112 cusparseStatus_t stat; 113 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 114 115 PetscFunctionBegin; 116 if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 117 if (cusparsestruct->handle != handle) { 118 if (cusparsestruct->handle) { 119 stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat); 120 } 121 cusparsestruct->handle = handle; 122 } 123 
stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 124 PetscFunctionReturn(0); 125 } 126 127 PetscErrorCode MatCUSPARSEClearHandle(Mat A) 128 { 129 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 130 PetscBool flg; 131 PetscErrorCode ierr; 132 133 PetscFunctionBegin; 134 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 135 if (!flg || !cusparsestruct) PetscFunctionReturn(0); 136 if (cusparsestruct->handle) cusparsestruct->handle = 0; 137 PetscFunctionReturn(0); 138 } 139 140 PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type) 141 { 142 PetscFunctionBegin; 143 *type = MATSOLVERCUSPARSE; 144 PetscFunctionReturn(0); 145 } 146 147 /*MC 148 MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices 149 on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported 150 algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 151 performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 152 CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 153 algorithms are not recommended. This class does NOT support direct solver operations. 
154 155 Level: beginner 156 157 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 158 M*/ 159 160 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B) 161 { 162 PetscErrorCode ierr; 163 PetscInt n = A->rmap->n; 164 165 PetscFunctionBegin; 166 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 167 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 168 (*B)->factortype = ftype; 169 (*B)->useordering = PETSC_TRUE; 170 ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 171 172 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 173 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 174 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 175 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 176 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 177 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 178 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 179 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 180 181 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 182 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr); 183 PetscFunctionReturn(0); 184 } 185 186 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 187 { 188 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 189 190 PetscFunctionBegin; 191 switch (op) { 192 case MAT_CUSPARSE_MULT: 193 cusparsestruct->format = format; 194 break; 195 case MAT_CUSPARSE_ALL: 196 cusparsestruct->format = format; 197 break; 198 default: 199 
SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op); 200 } 201 PetscFunctionReturn(0); 202 } 203 204 /*@ 205 MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 206 operation. Only the MatMult operation can use different GPU storage formats 207 for MPIAIJCUSPARSE matrices. 208 Not Collective 209 210 Input Parameters: 211 + A - Matrix of type SEQAIJCUSPARSE 212 . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 213 - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2) 214 215 Output Parameter: 216 217 Level: intermediate 218 219 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 220 @*/ 221 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 222 { 223 PetscErrorCode ierr; 224 225 PetscFunctionBegin; 226 PetscValidHeaderSpecific(A, MAT_CLASSID,1); 227 ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr); 228 PetscFunctionReturn(0); 229 } 230 231 PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg) 232 { 233 PetscErrorCode ierr; 234 235 PetscFunctionBegin; 236 switch (op) { 237 case MAT_FORM_EXPLICIT_TRANSPOSE: 238 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 239 if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);} 240 A->form_explicit_transpose = flg; 241 break; 242 default: 243 ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr); 244 break; 245 } 246 PetscFunctionReturn(0); 247 } 248 249 static 
PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A); 250 251 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 252 { 253 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 254 IS isrow = b->row,iscol = b->col; 255 PetscBool row_identity,col_identity; 256 PetscErrorCode ierr; 257 258 PetscFunctionBegin; 259 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 260 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 261 B->offloadmask = PETSC_OFFLOAD_CPU; 262 /* determine which version of MatSolve needs to be used. */ 263 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 264 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 265 if (row_identity && col_identity) { 266 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 267 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 268 B->ops->matsolve = NULL; 269 B->ops->matsolvetranspose = NULL; 270 } else { 271 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 272 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 273 B->ops->matsolve = NULL; 274 B->ops->matsolvetranspose = NULL; 275 } 276 277 /* get the triangular factors */ 278 ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 279 PetscFunctionReturn(0); 280 } 281 282 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 283 { 284 PetscErrorCode ierr; 285 MatCUSPARSEStorageFormat format; 286 PetscBool flg; 287 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 288 289 PetscFunctionBegin; 290 ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr); 291 if (A->factortype == MAT_FACTOR_NONE) { 292 ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV", 293 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 294 if (flg) {ierr = 
MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);} 295 296 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 297 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 298 if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);} 299 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 300 ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", 301 "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr); 302 /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */ 303 if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 304 305 ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", 306 "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr); 307 if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly"); 308 309 ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", 310 "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr); 311 if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated 
accordingly"); 312 #endif 313 } 314 ierr = PetscOptionsTail();CHKERRQ(ierr); 315 PetscFunctionReturn(0); 316 } 317 318 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 319 { 320 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 321 PetscErrorCode ierr; 322 323 PetscFunctionBegin; 324 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 325 ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 326 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 327 PetscFunctionReturn(0); 328 } 329 330 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 331 { 332 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 333 PetscErrorCode ierr; 334 335 PetscFunctionBegin; 336 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 337 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 338 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 339 PetscFunctionReturn(0); 340 } 341 342 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 343 { 344 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 345 PetscErrorCode ierr; 346 347 PetscFunctionBegin; 348 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 349 ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 350 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 351 PetscFunctionReturn(0); 352 } 353 354 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 355 { 356 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 357 PetscErrorCode ierr; 358 359 PetscFunctionBegin; 360 ierr = 
MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 361 ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 362 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 363 PetscFunctionReturn(0); 364 } 365 366 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 367 { 368 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 369 PetscInt n = A->rmap->n; 370 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 371 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 372 cusparseStatus_t stat; 373 const PetscInt *ai = a->i,*aj = a->j,*vi; 374 const MatScalar *aa = a->a,*v; 375 PetscInt *AiLo, *AjLo; 376 PetscInt i,nz, nzLower, offset, rowOffset; 377 PetscErrorCode ierr; 378 cudaError_t cerr; 379 380 PetscFunctionBegin; 381 if (!n) PetscFunctionReturn(0); 382 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 383 try { 384 /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. 
*/ 385 nzLower=n+ai[n]-ai[1]; 386 if (!loTriFactor) { 387 PetscScalar *AALo; 388 389 cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr); 390 391 /* Allocate Space for the lower triangular matrix */ 392 cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 393 cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr); 394 395 /* Fill the lower triangular matrix */ 396 AiLo[0] = (PetscInt) 0; 397 AiLo[n] = nzLower; 398 AjLo[0] = (PetscInt) 0; 399 AALo[0] = (MatScalar) 1.0; 400 v = aa; 401 vi = aj; 402 offset = 1; 403 rowOffset= 1; 404 for (i=1; i<n; i++) { 405 nz = ai[i+1] - ai[i]; 406 /* additional 1 for the term on the diagonal */ 407 AiLo[i] = rowOffset; 408 rowOffset += nz+1; 409 410 ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr); 411 ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr); 412 413 offset += nz; 414 AjLo[offset] = (PetscInt) i; 415 AALo[offset] = (MatScalar) 1.0; 416 offset += 1; 417 418 v += nz; 419 vi += nz; 420 } 421 422 /* allocate space for the triangular factor information */ 423 ierr = PetscNew(&loTriFactor);CHKERRQ(ierr); 424 loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 425 /* Create the matrix description */ 426 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 427 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 428 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 429 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 430 #else 431 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 432 #endif 433 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat); 434 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 435 436 /* set the operation */ 437 loTriFactor->solveOp = 
CUSPARSE_OPERATION_NON_TRANSPOSE; 438 439 /* set the matrix */ 440 loTriFactor->csrMat = new CsrMatrix; 441 loTriFactor->csrMat->num_rows = n; 442 loTriFactor->csrMat->num_cols = n; 443 loTriFactor->csrMat->num_entries = nzLower; 444 445 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 446 loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1); 447 448 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 449 loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower); 450 451 loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 452 loTriFactor->csrMat->values->assign(AALo, AALo+nzLower); 453 454 /* Create the solve analysis information */ 455 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 456 stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 457 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 458 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 459 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 460 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 461 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 462 &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 463 cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 464 #endif 465 466 /* perform the solve analysis */ 467 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 468 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 469 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 470 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo 471 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 472 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 473 #endif 474 );CHKERRCUSPARSE(stat); 475 cerr = WaitForCUDA();CHKERRCUDA(cerr); 476 ierr = 
PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 477 478 /* assign the pointer */ 479 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 480 loTriFactor->AA_h = AALo; 481 cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr); 482 cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr); 483 ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 484 } else { /* update values only */ 485 if (!loTriFactor->AA_h) { 486 cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr); 487 } 488 /* Fill the lower triangular matrix */ 489 loTriFactor->AA_h[0] = 1.0; 490 v = aa; 491 vi = aj; 492 offset = 1; 493 for (i=1; i<n; i++) { 494 nz = ai[i+1] - ai[i]; 495 ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr); 496 offset += nz; 497 loTriFactor->AA_h[offset] = 1.0; 498 offset += 1; 499 v += nz; 500 } 501 loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower); 502 ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 503 } 504 } catch(char *ex) { 505 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 506 } 507 } 508 PetscFunctionReturn(0); 509 } 510 511 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 512 { 513 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 514 PetscInt n = A->rmap->n; 515 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 516 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 517 cusparseStatus_t stat; 518 const PetscInt *aj = a->j,*adiag = a->diag,*vi; 519 const MatScalar *aa = a->a,*v; 520 PetscInt *AiUp, *AjUp; 521 PetscInt i,nz, nzUpper, offset; 522 PetscErrorCode ierr; 523 cudaError_t cerr; 524 525 PetscFunctionBegin; 526 if (!n) PetscFunctionReturn(0); 527 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 528 try { 529 /* next, 
figure out the number of nonzeros in the upper triangular matrix. */ 530 nzUpper = adiag[0]-adiag[n]; 531 if (!upTriFactor) { 532 PetscScalar *AAUp; 533 534 cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 535 536 /* Allocate Space for the upper triangular matrix */ 537 cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 538 cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 539 540 /* Fill the upper triangular matrix */ 541 AiUp[0]=(PetscInt) 0; 542 AiUp[n]=nzUpper; 543 offset = nzUpper; 544 for (i=n-1; i>=0; i--) { 545 v = aa + adiag[i+1] + 1; 546 vi = aj + adiag[i+1] + 1; 547 548 /* number of elements NOT on the diagonal */ 549 nz = adiag[i] - adiag[i+1]-1; 550 551 /* decrement the offset */ 552 offset -= (nz+1); 553 554 /* first, set the diagonal elements */ 555 AjUp[offset] = (PetscInt) i; 556 AAUp[offset] = (MatScalar)1./v[nz]; 557 AiUp[i] = AiUp[i+1] - (nz+1); 558 559 ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr); 560 ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr); 561 } 562 563 /* allocate space for the triangular factor information */ 564 ierr = PetscNew(&upTriFactor);CHKERRQ(ierr); 565 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 566 567 /* Create the matrix description */ 568 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 569 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 570 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 571 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 572 #else 573 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 574 #endif 575 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 576 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 
577 578 /* set the operation */ 579 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 580 581 /* set the matrix */ 582 upTriFactor->csrMat = new CsrMatrix; 583 upTriFactor->csrMat->num_rows = n; 584 upTriFactor->csrMat->num_cols = n; 585 upTriFactor->csrMat->num_entries = nzUpper; 586 587 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 588 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 589 590 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 591 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 592 593 upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 594 upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 595 596 /* Create the solve analysis information */ 597 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 598 stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 599 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 600 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 601 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 602 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 603 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 604 &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 605 cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 606 #endif 607 608 /* perform the solve analysis */ 609 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 610 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 611 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 612 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo 613 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 614 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 615 #endif 616 );CHKERRCUSPARSE(stat); 617 cerr = 
WaitForCUDA();CHKERRCUDA(cerr); 618 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 619 620 /* assign the pointer */ 621 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 622 upTriFactor->AA_h = AAUp; 623 cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 624 cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 625 ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 626 } else { 627 if (!upTriFactor->AA_h) { 628 cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 629 } 630 /* Fill the upper triangular matrix */ 631 offset = nzUpper; 632 for (i=n-1; i>=0; i--) { 633 v = aa + adiag[i+1] + 1; 634 635 /* number of elements NOT on the diagonal */ 636 nz = adiag[i] - adiag[i+1]-1; 637 638 /* decrement the offset */ 639 offset -= (nz+1); 640 641 /* first, set the diagonal elements */ 642 upTriFactor->AA_h[offset] = 1./v[nz]; 643 ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr); 644 } 645 upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper); 646 ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 647 } 648 } catch(char *ex) { 649 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 650 } 651 } 652 PetscFunctionReturn(0); 653 } 654 655 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 656 { 657 PetscErrorCode ierr; 658 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 659 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 660 IS isrow = a->row,iscol = a->icol; 661 PetscBool row_identity,col_identity; 662 PetscInt n = A->rmap->n; 663 664 PetscFunctionBegin; 665 if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 666 ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr); 667 ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr); 668 669 if 
(!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 670 cusparseTriFactors->nnz=a->nz; 671 672 A->offloadmask = PETSC_OFFLOAD_BOTH; 673 /* lower triangular indices */ 674 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 675 if (!row_identity && !cusparseTriFactors->rpermIndices) { 676 const PetscInt *r; 677 678 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 679 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 680 cusparseTriFactors->rpermIndices->assign(r, r+n); 681 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 682 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 683 } 684 685 /* upper triangular indices */ 686 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 687 if (!col_identity && !cusparseTriFactors->cpermIndices) { 688 const PetscInt *c; 689 690 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 691 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 692 cusparseTriFactors->cpermIndices->assign(c, c+n); 693 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 694 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 695 } 696 PetscFunctionReturn(0); 697 } 698 699 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 700 { 701 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 702 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 703 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 704 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 705 cusparseStatus_t stat; 706 PetscErrorCode ierr; 707 cudaError_t cerr; 708 PetscInt *AiUp, *AjUp; 709 PetscScalar *AAUp; 710 PetscScalar *AALo; 711 PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 712 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 713 const PetscInt *ai = b->i,*aj = b->j,*vj; 714 const MatScalar *aa = b->a,*v; 715 716 PetscFunctionBegin; 717 if (!n) 
PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the upper (U) and scaled lower (L = D^{-1} U^T) values */
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) {
        /* First build: allocate space for the upper triangular matrix structure */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers; SBAIJ stores the diagonal entry last in each row slice */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements (inverted for the solve) */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            /* negate off-diagonals; AALo additionally divides by the diagonal */
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = A->rmap->n;
        upTriFactor->csrMat->num_cols    = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                                 );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information
           (the "lower" factor reuses the same upper structure, solved transposed) */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        /* fill mode stays UPPER because solveOp below is TRANSPOSE */
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = A->rmap->n;
        loTriFactor->csrMat->num_cols    = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                                 );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else {
        /* Structure already on the GPU: recompute only the numerical values. */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/*
   Builds the ICC factors on the GPU and caches the factorization permutation
   (and its inverse) as GPU index arrays for use by MatSolve.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  /* nnz of the full (symmetric) factor: both triangles share the diagonal */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices: upload permutation and its inverse only when non-identity */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip,*rip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Check both pre-launch (sync) and kernel-execution (async) CUDA errors; the
   cudaDeviceSynchronize() makes this expensive, so use only for debugging. */
#define CHECK_LAUNCH_ERROR()                                                             \
do {                                                                                     \
  /* Check synchronous errors, i.e. pre-launch */                                        \
  cudaError_t err = cudaGetLastError();                                                  \
  if (cudaSuccess != err) {                                                              \
    SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cuda error: %s",cudaGetErrorString(err)); \
  }                                                                                      \
  /* Check asynchronous errors, i.e. kernel failed (ULF) */                              \
  err = cudaDeviceSynchronize();                                                         \
  if (cudaSuccess != err) {                                                              \
    SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cuda error: %s",cudaGetErrorString(err)); \
  }                                                                                      \
} while (0)

/*
   Numeric Cholesky factorization: performs the factorization on the CPU via
   MatCholeskyFactorNumeric_SeqAIJ, then installs the GPU solve kernels and
   uploads the triangular factors.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* factorization runs on the host, so make sure A's values are current there */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Builds explicit transposes (CSC forms) of both triangular factors and runs
   the cusparse solve analysis on them, so MatSolveTranspose can use
   NON_TRANSPOSE solves on the transposed data.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor;
     the fill mode flips since the transpose of a lower factor is upper */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor */
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                       loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                       loTriFactor->csrMat->values->data().get(),
                                       loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(),
                                       loTriFactorT->csrMat->values->data().get(),
                                       loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
#else
                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
#endif
                          );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* FIX: this previously called PetscLogEventBegin() a second time, leaving the
     MAT_CUSPARSEGenerateTranspose event unbalanced (Begin/Begin with no End) */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat =
cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  /* query and allocate the workspace required by the triangular-solve analysis */
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                           loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
#endif
                           );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor;
     fill mode flips because the transpose of an upper factor is lower */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
               CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor */
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e.
the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                       upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                       upTriFactor->csrMat->values->data().get(),
                                       upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(),
                                       upTriFactorT->csrMat->values->data().get(),
                                       upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
#else
                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
#endif
                          );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* FIX: this previously called PetscLogEventBegin() a second time, leaving the
     MAT_CUSPARSEGenerateTranspose event unbalanced (Begin/Begin with no End) */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat =
cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  /* query and allocate the workspace required by the triangular-solve analysis */
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                           upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
#endif
                           );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}

/* Unary functor: extract the real part of a PetscScalar as a PetscInt.
   Used below to recover the csr2csc permutation from values seeded by thrust::sequence. */
struct PetscScalarToPetscInt
{
  __host__ __device__
  PetscInt operator()(PetscScalar s)
  {
    return (PetscInt)PetscRealPart(s);
  }
};

/*
   Forms (or refreshes) the explicit transpose of the matrix on the GPU for use
   in MatMultTranspose. The csr2csc permutation is computed once and cached in
   cusparsestruct->csr2csc_i; subsequent value updates are a pure gather.
*/
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (!A->form_explicit_transpose || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0);
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing matTranspose struct");
  /* transpose already up to date: nothing to do */
  if (A->transupdated) PetscFunctionReturn(0);
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
  }
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

    /* set alpha and beta (device-resident constants used by SpMV) */
    err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      CsrMatrix *matrixT      = new CsrMatrix;
      matstructT->mat         = matrixT;
      matrixT->num_rows       = A->cmap->n;
      matrixT->num_cols       = A->rmap->n;
      matrixT->num_entries    = a->nz;
      matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows+1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values         = new THRUSTARRAY(a->nz);

      /* 32-bit copy of the row offsets, as required by the cusparse csr2csc API */
      if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&matstructT->matDescr,
                               matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                               matrixT->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows       = A->rmap->n;
      temp->num_cols       = A->cmap->n;
      temp->num_entries    = a->nz;
      temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n+1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle,
                              matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get());CHKERRCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e.
the matrix transpose) */ 1287 tempT->num_rows = A->rmap->n; 1288 tempT->num_cols = A->cmap->n; 1289 tempT->num_entries = a->nz; 1290 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1291 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1292 tempT->values = new THRUSTARRAY(a->nz); 1293 1294 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1295 temp->num_cols, temp->num_entries, 1296 temp->values->data().get(), 1297 temp->row_offsets->data().get(), 1298 temp->column_indices->data().get(), 1299 tempT->values->data().get(), 1300 tempT->column_indices->data().get(), 1301 tempT->row_offsets->data().get(), 1302 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1303 1304 /* Last, convert CSC to HYB */ 1305 cusparseHybMat_t hybMat; 1306 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1307 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1308 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1309 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1310 matstructT->descr, tempT->values->data().get(), 1311 tempT->row_offsets->data().get(), 1312 tempT->column_indices->data().get(), 1313 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1314 1315 /* assign the pointer */ 1316 matstructT->mat = hybMat; 1317 A->transupdated = PETSC_TRUE; 1318 /* delete temporaries */ 1319 if (tempT) { 1320 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1321 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1322 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1323 delete (CsrMatrix*) tempT; 1324 } 1325 if (temp) { 1326 if (temp->values) delete (THRUSTARRAY*) temp->values; 1327 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1328 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1329 delete (CsrMatrix*) temp; 1330 } 1331 #endif 1332 } 1333 } 1334 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* 
transpose mat struct may be already present, update data */ 1335 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1336 CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat; 1337 if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix"); 1338 if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix rows"); 1339 if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix cols"); 1340 if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix values"); 1341 if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT"); 1342 if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT rows"); 1343 if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT cols"); 1344 if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT values"); 1345 if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */ 1346 cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1347 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 1348 ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 1349 } 1350 if (!cusparsestruct->csr2csc_i) { 1351 THRUSTARRAY csr2csc_a(matrix->num_entries); 1352 PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0)); 1353 1354 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1355 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1356 void *csr2cscBuffer; 1357 size_t csr2cscBufferSize; 1358 stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, 1359 A->cmap->n, matrix->num_entries, 1360 matrix->values->data().get(), 1361 cusparsestruct->rowoffsets_gpu->data().get(), 1362 matrix->column_indices->data().get(), 1363 matrixT->values->data().get(), 1364 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 
cusparse_scalartype, 1365 CUSPARSE_ACTION_NUMERIC,indexBase, 1366 cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat); 1367 err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err); 1368 #endif 1369 1370 if (matrix->num_entries) { 1371 /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in 1372 mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK. 1373 I checked every parameters and they were just fine. I have no clue why cusparse complains. 1374 1375 Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[] 1376 should be filled with indexBase. So I just take a shortcut here. 1377 */ 1378 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1379 A->cmap->n,matrix->num_entries, 1380 csr2csc_a.data().get(), 1381 cusparsestruct->rowoffsets_gpu->data().get(), 1382 matrix->column_indices->data().get(), 1383 matrixT->values->data().get(), 1384 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1385 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1386 CUSPARSE_ACTION_NUMERIC,indexBase, 1387 cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat); 1388 #else 1389 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1390 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1391 #endif 1392 } else { 1393 matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 1394 } 1395 1396 cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); 1397 PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt())); 1398 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1399 err = cudaFree(csr2cscBuffer);CHKERRCUDA(err); 1400 #endif 1401 } 1402 PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), 
cusparsestruct->csr2csc_i->begin()),
                                     thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
                                     matrixT->values->begin()));
  }
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/* Solve A^T x = b with the transposed triangular factors (built lazily below).
   Visible sequence: permute b with rpermIndices into x, triangular-solve with the
   transposed U factor, then with the transposed L factor, and finally apply
   cpermIndices through the work vector.  All thrust calls run on
   PetscDefaultCudaStream. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get()
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Transpose solve for factors stored in natural ordering: no row/column
   permutation is applied; b feeds the transposed-U solve directly and the
   transposed-L solve writes straight into x. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get()
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Solve A x = b with the (non-transposed) triangular factors: permute b with
   rpermIndices into the work vector, solve L then U, and scatter the result to
   x through cpermIndices. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Then, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get()
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Solve A x = b for factors in natural ordering: no permutations; b feeds the
   L solve directly and the U solve writes straight into x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get()
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Next, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Copy the matrix values (only) from the device CSR storage back into the host
   a->a array when the GPU holds the up-to-date copy; afterwards both copies are
   valid (PETSC_OFFLOAD_BOTH). */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cudaError_t        cerr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    /* NOTE(review): cusp->mat->mat is cast to CsrMatrix* unconditionally;
       presumably this path is only reached with MAT_CUSPARSE_CSR storage — confirm */
    CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat;

    ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

/* Return the host value array; syncs values from the GPU first and marks the
   CPU copy as the authoritative one (caller may modify the array). */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  *array = a->a;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}

/* Mirror the host CSR data onto the GPU.  If the nonzero pattern is unchanged
   and the storage is CSR, only the values are re-uploaded; otherwise the device
   structures are rebuilt from scratch (and the cached transpose invalidated). */
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED ||
A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR values");
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      matrix->values->assign(a->a, a->a+a->nz);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* values changed, so any cached transpose values are stale (structure kept) */
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
    } else {
      PetscInt nnz;
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* nonzero pattern (or format) changed: tear down and rebuild all device data */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        if (a->compressedrow.use) {
          /* compressed-row storage: only rows with nonzeros are represented */
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR row data");
        if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR column data");
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; } /* structure only: values not uploaded, so do not claim OFFLOAD_BOTH */
        else nnz = a->nz;

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident scalar constants; pointer mode below tells cusparse
           alpha/beta live on the device */
        err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                     mat->num_rows, mat->num_cols, mat->num_entries,
                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                     mat->values->data().get(),
                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* build a temporary CSR image, convert it to HYB, then free the CSR copy */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
                                  matstruct->descr, mat->values->data().get(),
                                  mat->row_offsets->data().get(),
                                  mat->column_indices->data().get(),
                                  hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

/* thrust functor over a zipped tuple: element 1 += element 0 */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/* thrust functor over a zipped tuple: element 1 = element 0 */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/* thrust functor over a zipped tuple: element 0 = element 1 (reverse of VecCUDAEquals) */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};

/* Scratch data attached to a Mat_Product for sparse(AIJCUSPARSE) x dense(DENSECUDA)
   and sparse-sparse products; freed by MatDestroy_MatMatCusparse(). */
struct MatMatCusparse {
  PetscBool cisdense;       /* was C originally MATSEQDENSE (CPU)? convert back after numeric phase */
  PetscScalar *Bt;          /* buffer holding B^T when the backend cannot transpose B itself */
  Mat X;                    /* intermediate dense product for PtAP/RARt */
  PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;
  CsrMatrix *Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t matSpBDescr;
  PetscBool initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;
  cusparseDnMatDescr_t matCDescr;
  PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
  size_t mmBufferSize;
  void *mmBuffer;
  void *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};

/* Destructor for the MatMatCusparse scratch data: releases device buffers,
   cusparse descriptors, the intermediate dense matrix X, and the struct itself. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  PetscErrorCode ierr;
  MatMatCusparse *mmdata = (MatMatCusparse *)data;
  cudaError_t    cerr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
#endif

  PetscFunctionBegin;
  cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr); /* cudaFree(NULL) is a no-op */
  delete mmdata->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->mmBuffer)    { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); }
  if (mmdata->mmBuffer2)   { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); }
  if (mmdata->matBDescr)   { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->matCDescr)   { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->spgemmDesc)  { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); }
#endif
  ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);

/* Numeric phase of C = op(A) op(B) for A seqaijcusparse and B dense; PtAP/RARt
   are computed as an SpMM into the intermediate mmdata->X followed by a dense
   multiply with B. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      ierr =
MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
      mat = cusp->matTranspose;
      opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    /* SpMM result goes to the intermediate X, not to C directly */
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ?
    CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
    /* grow the workspace only when the required size exceeds what we already hold */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
      cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
#else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr; /* NOTE(review): shadows the outer cudaError_t cerr within this scope */

    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    /* explicit B^T via cublas geam into the preallocated mmdata->Bt */
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
#endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  if (mmdata->cisdense) {
    /* the user originally had a CPU dense C: hand it back in that type */
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Symbolic phase of the AIJCUSPARSE x DENSECUDA product: sets the sizes/type of
   C and allocates the MatMatCusparse scratch data (B^T buffer and/or the
   intermediate X where required by the product type). */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for
MAT_CUSPARSE_CSR format"); 2165 switch (product->type) { 2166 case MATPRODUCT_AB: 2167 m = A->rmap->n; 2168 n = B->cmap->n; 2169 break; 2170 case MATPRODUCT_AtB: 2171 m = A->cmap->n; 2172 n = B->cmap->n; 2173 break; 2174 case MATPRODUCT_ABt: 2175 m = A->rmap->n; 2176 n = B->rmap->n; 2177 break; 2178 case MATPRODUCT_PtAP: 2179 m = B->cmap->n; 2180 n = B->cmap->n; 2181 break; 2182 case MATPRODUCT_RARt: 2183 m = B->rmap->n; 2184 n = B->rmap->n; 2185 break; 2186 default: 2187 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 2188 } 2189 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2190 /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */ 2191 ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr); 2192 ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr); 2193 2194 /* product data */ 2195 ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2196 mmdata->cisdense = cisdense; 2197 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0) 2198 /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */ 2199 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2200 cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr); 2201 } 2202 #endif 2203 /* for these products we need intermediate storage */ 2204 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2205 ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr); 2206 ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr); 2207 if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */ 2208 ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr); 2209 } else { 2210 ierr = 
MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr); 2211 } 2212 } 2213 C->product->data = mmdata; 2214 C->product->destroy = MatDestroy_MatMatCusparse; 2215 2216 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 2217 PetscFunctionReturn(0); 2218 } 2219 2220 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2221 { 2222 Mat_Product *product = C->product; 2223 Mat A,B; 2224 Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2225 Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 2226 Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2227 CsrMatrix *Acsr,*Bcsr,*Ccsr; 2228 PetscBool flg; 2229 PetscErrorCode ierr; 2230 cusparseStatus_t stat; 2231 cudaError_t cerr; 2232 MatProductType ptype; 2233 MatMatCusparse *mmdata; 2234 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2235 cusparseSpMatDescr_t BmatSpDescr; 2236 #endif 2237 2238 PetscFunctionBegin; 2239 MatCheckProduct(C,1); 2240 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 2241 ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2242 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for C of type %s",((PetscObject)C)->type_name); 2243 mmdata = (MatMatCusparse*)C->product->data; 2244 A = product->A; 2245 B = product->B; 2246 if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */ 2247 mmdata->reusesym = PETSC_FALSE; 2248 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2249 if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2250 Cmat = Ccusp->mat; 2251 if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]); 2252 Ccsr = (CsrMatrix*)Cmat->mat; 2253 if (!Ccsr) 
SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct"); 2254 goto finalize; 2255 } 2256 if (!c->nz) goto finalize; 2257 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2258 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 2259 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2260 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name); 2261 if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2262 if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2263 Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2264 Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2265 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2266 if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2267 if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2268 if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2269 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2270 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 2271 2272 ptype = product->type; 2273 if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 2274 if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 2275 switch (ptype) { 2276 case MATPRODUCT_AB: 2277 Amat = Acusp->mat; 2278 Bmat = Bcusp->mat; 2279 break; 2280 case MATPRODUCT_AtB: 2281 Amat = Acusp->matTranspose; 2282 Bmat = Bcusp->mat; 2283 break; 2284 case MATPRODUCT_ABt: 2285 Amat = 
Acusp->mat; 2286 Bmat = Bcusp->matTranspose; 2287 break; 2288 default: 2289 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 2290 } 2291 Cmat = Ccusp->mat; 2292 if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2293 if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2294 if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[ptype]); 2295 Acsr = (CsrMatrix*)Amat->mat; 2296 Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */ 2297 Ccsr = (CsrMatrix*)Cmat->mat; 2298 if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct"); 2299 if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct"); 2300 if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct"); 2301 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2302 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2303 BmatSpDescr = mmdata->Bcsr ? 
mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */ 2304 stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2305 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2306 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2307 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2308 stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2309 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2310 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2311 #else 2312 stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2313 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2314 Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2315 Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2316 Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2317 #endif 2318 ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2319 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2320 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2321 C->offloadmask = PETSC_OFFLOAD_GPU; 2322 finalize: 2323 /* shorter version of MatAssemblyEnd_SeqAIJ */ 2324 ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 2325 ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 2326 ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr); 2327 c->reallocs = 0; 2328 C->info.mallocs += 0; 2329 C->info.nz_unneeded = 0; 2330 C->assembled = C->was_assembled = 
PETSC_TRUE; 2331 C->num_ass++; 2332 PetscFunctionReturn(0); 2333 } 2334 2335 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2336 { 2337 Mat_Product *product = C->product; 2338 Mat A,B; 2339 Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2340 Mat_SeqAIJ *a,*b,*c; 2341 Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2342 CsrMatrix *Acsr,*Bcsr,*Ccsr; 2343 PetscInt i,j,m,n,k; 2344 PetscBool flg; 2345 PetscErrorCode ierr; 2346 cusparseStatus_t stat; 2347 cudaError_t cerr; 2348 MatProductType ptype; 2349 MatMatCusparse *mmdata; 2350 PetscLogDouble flops; 2351 PetscBool biscompressed,ciscompressed; 2352 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2353 int64_t C_num_rows1, C_num_cols1, C_nnz1; 2354 size_t bufSize2; 2355 cusparseSpMatDescr_t BmatSpDescr; 2356 #else 2357 int cnz; 2358 #endif 2359 2360 PetscFunctionBegin; 2361 MatCheckProduct(C,1); 2362 if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty"); 2363 A = product->A; 2364 B = product->B; 2365 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2366 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 2367 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2368 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name); 2369 a = (Mat_SeqAIJ*)A->data; 2370 b = (Mat_SeqAIJ*)B->data; 2371 Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2372 Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2373 if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2374 if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2375 2376 /* product data */ 2377 ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2378 C->product->data = mmdata; 2379 C->product->destroy = 
MatDestroy_MatMatCusparse; 2380 2381 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2382 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 2383 ptype = product->type; 2384 if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 2385 if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 2386 biscompressed = PETSC_FALSE; 2387 ciscompressed = PETSC_FALSE; 2388 switch (ptype) { 2389 case MATPRODUCT_AB: 2390 m = A->rmap->n; 2391 n = B->cmap->n; 2392 k = A->cmap->n; 2393 Amat = Acusp->mat; 2394 Bmat = Bcusp->mat; 2395 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2396 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2397 break; 2398 case MATPRODUCT_AtB: 2399 m = A->cmap->n; 2400 n = B->cmap->n; 2401 k = A->rmap->n; 2402 ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr); 2403 Amat = Acusp->matTranspose; 2404 Bmat = Bcusp->mat; 2405 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2406 break; 2407 case MATPRODUCT_ABt: 2408 m = A->rmap->n; 2409 n = B->rmap->n; 2410 k = A->cmap->n; 2411 ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr); 2412 Amat = Acusp->mat; 2413 Bmat = Bcusp->matTranspose; 2414 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2415 break; 2416 default: 2417 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 2418 } 2419 2420 /* create cusparse matrix */ 2421 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2422 ierr = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 2423 c = (Mat_SeqAIJ*)C->data; 2424 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2425 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 2426 Ccsr = new CsrMatrix; 2427 2428 c->compressedrow.use = ciscompressed; 2429 if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */ 2430 c->compressedrow.nrows = a->compressedrow.nrows; 2431 ierr = 
PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr); 2432 ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr); 2433 Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 2434 Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 2435 Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows); 2436 } else { 2437 c->compressedrow.nrows = 0; 2438 c->compressedrow.i = NULL; 2439 c->compressedrow.rindex = NULL; 2440 Ccusp->workVector = NULL; 2441 Cmat->cprowIndices = NULL; 2442 } 2443 Ccusp->nrows = ciscompressed ? c->compressedrow.nrows : m; 2444 Ccusp->mat = Cmat; 2445 Ccusp->mat->mat = Ccsr; 2446 Ccsr->num_rows = Ccusp->nrows; 2447 Ccsr->num_cols = n; 2448 Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1); 2449 stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat); 2450 stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 2451 stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 2452 cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 2453 cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 2454 cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 2455 cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2456 cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2457 cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2458 if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! 
*/ 2459 thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0); 2460 c->nz = 0; 2461 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2462 Ccsr->values = new THRUSTARRAY(c->nz); 2463 goto finalizesym; 2464 } 2465 2466 if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2467 if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2468 Acsr = (CsrMatrix*)Amat->mat; 2469 if (!biscompressed) { 2470 Bcsr = (CsrMatrix*)Bmat->mat; 2471 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2472 BmatSpDescr = Bmat->matDescr; 2473 #endif 2474 } else { /* we need to use row offsets for the full matrix */ 2475 CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat; 2476 Bcsr = new CsrMatrix; 2477 Bcsr->num_rows = B->rmap->n; 2478 Bcsr->num_cols = cBcsr->num_cols; 2479 Bcsr->num_entries = cBcsr->num_entries; 2480 Bcsr->column_indices = cBcsr->column_indices; 2481 Bcsr->values = cBcsr->values; 2482 if (!Bcusp->rowoffsets_gpu) { 2483 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 2484 Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 2485 ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 2486 } 2487 Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 2488 mmdata->Bcsr = Bcsr; 2489 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2490 if (Bcsr->num_rows && Bcsr->num_cols) { 2491 stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, 2492 Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2493 Bcsr->values->data().get(), 2494 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2495 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2496 } 2497 BmatSpDescr = mmdata->matSpBDescr; 2498 #endif 2499 } 2500 if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct"); 2501 if (!Bcsr) 
SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct"); 2502 /* precompute flops count */ 2503 if (ptype == MATPRODUCT_AB) { 2504 for (i=0, flops = 0; i<A->rmap->n; i++) { 2505 const PetscInt st = a->i[i]; 2506 const PetscInt en = a->i[i+1]; 2507 for (j=st; j<en; j++) { 2508 const PetscInt brow = a->j[j]; 2509 flops += 2.*(b->i[brow+1] - b->i[brow]); 2510 } 2511 } 2512 } else if (ptype == MATPRODUCT_AtB) { 2513 for (i=0, flops = 0; i<A->rmap->n; i++) { 2514 const PetscInt anzi = a->i[i+1] - a->i[i]; 2515 const PetscInt bnzi = b->i[i+1] - b->i[i]; 2516 flops += (2.*anzi)*bnzi; 2517 } 2518 } else { /* TODO */ 2519 flops = 0.; 2520 } 2521 2522 mmdata->flops = flops; 2523 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2524 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2525 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2526 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, 2527 NULL, NULL, NULL, 2528 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2529 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2530 stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2531 /* ask bufferSize bytes for external memory */ 2532 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2533 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2534 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2535 mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat); 2536 cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr); 2537 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2538 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2539 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2540 cusparse_scalartype, 
CUSPARSE_SPGEMM_DEFAULT, 2541 mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat); 2542 /* ask bufferSize again bytes for external memory */ 2543 stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2544 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2545 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2546 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat); 2547 /* The CUSPARSE documentation is not clear, nor the API 2548 We need both buffers to perform the operations properly! 2549 mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 2550 it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 2551 is stored in the descriptor! What a messy API... */ 2552 cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr); 2553 /* compute the intermediate product of A * B */ 2554 stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2555 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2556 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2557 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2558 /* get matrix C non-zero entries C_nnz1 */ 2559 stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat); 2560 c->nz = (PetscInt) C_nnz1; 2561 ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr); 2562 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2563 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2564 Ccsr->values = new THRUSTARRAY(c->nz); 2565 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2566 
  /* attach the freshly allocated index/value arrays to C's descriptor, then copy the SpGEMM result into them */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  /* legacy (pre-11.0) two-step csrgemm: count nnz on the host pointer mode, then allocate and multiply */
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
  stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
  c->nz = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalizesym:
  /* mirror the device CSR pattern into the host Mat_SeqAIJ so CPU code sees a consistent matrix */
  c->singlemalloc = PETSC_FALSE;
  c->free_a       = PETSC_TRUE;
  c->free_ij      = PETSC_TRUE;
  ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
    PetscInt *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii = *Ccsr->row_offsets;
    jj = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i; /* compressed storage keeps its offsets separately */
    cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  if (ciscompressed) { /* need to expand host row offsets */
    PetscInt r = 0;
    c->i[0] = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old  = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r+1] = old; /* rows skipped by the compressed storage inherit the previous offset */
    }
    for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
  c->maxnz         = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax          = 0;
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k+1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt)!!nn;
    c->rmax = PetscMax(c->rmax,nn);
  }
  ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); /* host values array; left uninitialized until the numeric phase / a copy-to-CPU */
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated  = PETSC_TRUE;
  C->assembled     = PETSC_FALSE;
  C->was_assembled = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/* handles sparse or dense B */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
/* NOTE(review): body of the product-setup routine whose signature begins above this chunk.
   It reads mat->product (A,B,C,type) and selects mat->ops->productsymbolic, optionally
   letting the user force the CPU backend through options. */
  PetscErrorCode ierr;
  PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  /* B dense -> dispatch to the AIJCUSPARSE x DENSECUDA symbolic routine below */
  ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr);
  /* only consider the GPU backend when neither operand is bound to the CPU */
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
  }
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr);
    }
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    /* per-product-type option names differ depending on whether the user called the
       old-style API (MatMatMult etc.) or the MatProduct API */
    switch (product->type) {
    case MATPRODUCT_AB:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_ab_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_AtB:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_atb_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_PtAP:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_ptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_RARt:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_rart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_ABC:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_abc_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    default:
      break;
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE; /* user forced CPU backend */
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) {
        ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* yy = A*xx on the GPU; thin wrapper over the shared mult-add kernel */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* zz = A*xx + yy on the GPU */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr =
MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* yy = A^H * xx on the GPU */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* zz = A^H * xx + yy on the GPU */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* yy = A^T * xx on the GPU */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* y[idx[i]] += x[i] for i in [0,n): one thread per entry, with a bounds check for the
   partial last block. Used to scatter the compressed-row SpMV result into the full vector. */
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}

/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny; /* lengths of x and y for y = op(A) x; set in the CSR branches below */
#endif

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) { /* no nonzero rows: result is just y (or zero when there is no y) */
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    /* transpose: either let cuSPARSE apply op(A) to the untransposed data, or use an
       explicitly stored transpose when the user requested one (and !herm) */
    if (herm || !A->form_explicit_transpose) {
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
         allocated to accommodate different uses. So we get the length info directly from mat.
      */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
      */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA indexes the per-operation cached cuSpMV descriptors/buffers (0..2) */
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                       matstruct->matDescr,
                                       matstruct->cuSpMV[opA].vecXDescr, beta,
                                       matstruct->cuSpMV[opA].vecYDescr,
                                       cusparse_scalartype,
                                       cusparsestruct->spmvAlg,
                                       &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                          matstruct->alpha_one,
                          matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTransposeForMult() */
                          matstruct->cuSpMV[opA].vecXDescr,
                          beta,
                          matstruct->cuSpMV[opA].vecYDescr,
                          cusparse_scalartype,
                          cusparsestruct->spmvAlg,
                          matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
#else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
#endif
      }
    }
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
        */
#if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                                thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                                thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                                VecCUDAPlusEquals());
#else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
#endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr =
VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  /* flop count: 2 flops per stored nonzero; without the add we save one op per nonzero row */
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* zz = A^T * xx + yy on the GPU */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Finish assembly; when a device (split-CSR) matrix exists, final assembly marks the data as living on the GPU */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode             ierr;
  PetscSplitCSRDataStructure *d_mat = NULL;
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
  }
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it?
  if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0);
  if (d_mat) {
    A->offloadmask = PETSC_OFFLOAD_GPU;
  }

  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz).  By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Release GPU storage (or triangular factors), the device split-CSR matrix if any,
   composed function pointers, and finally the host SeqAIJ data */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode             ierr;
  PetscSplitCSRDataStructure *d_mat = NULL;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
    ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL;
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  if (d_mat) {
    Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
    cudaError_t                err;
    PetscSplitCSRDataStructure h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* copy the device-side struct back to host so its internal device pointers can be freed */
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    if (a->compressedrow.use) {
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);

/* Duplicate as SeqAIJ on the host, then convert the copy in place to SeqAIJCUSPARSE */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Y = Y + a*X on the GPU. Uses a plain cuBLAS axpy when the nonzero patterns match,
   cuSPARSE csrgeam for SUBSET_NONZERO_PATTERN, and falls back to the host code otherwise. */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;
  cudaError_t        cerr;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) { /* mixed implementations: do it on the host */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
  if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
  if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    cusparseStatus_t stat;
    PetscScalar      b = 1.0; /* coefficient of Y in Y = a*X + b*Y */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t           bufferSize;
    void             *buffer;
#endif

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    /* spgeam scalars a,b live on the host here; restore device pointer mode afterwards */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                          cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                               cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    cerr = cudaFree(buffer);CHKERRCUDA(cerr);
#else
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                               cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#endif
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical patterns: the values arrays line up, so a flat axpy over the nonzeros suffices */
    cublasHandle_t cublasv2handle;
    cublasStatus_t berr;
    PetscBLASInt   one = 1, bnz = 1;

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else {
    /* different patterns: host fallback */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Y = a*Y: scale the nonzero values on the GPU with cuBLAS */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *y = (Mat_SeqAIJ*)Y->data;
  PetscScalar    *ay;
  cudaError_t    cerr;
  cublasHandle_t cublasv2handle;
  cublasStatus_t berr;
  PetscBLASInt   one = 1, bnz = 1;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(y->nz,&bnz);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  berr = cublasXscal(cublasv2handle,bnz,&a,ay,one);CHKERRCUBLAS(berr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuFlops(bnz);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Zero all stored values on the GPU (matrix and cached transpose, when present)
   and on the host, then set the offload mask accordingly */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  PetscBool      both = PETSC_FALSE; /* did we zero a GPU copy as well? */
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (spptr->mat) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE;
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
    if (spptr->matTranspose) { /* also zero the cached explicit transpose so it stays consistent */
      CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
      if (matrix->values) {
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
  }
  //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
  ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;

  PetscFunctionReturn(0);
}

/* Switch the Mat operation table (and composed functions) between the host SeqAIJ
   implementations (flg == PETSC_TRUE) and the GPU CUSPARSE ones (flg == PETSC_FALSE) */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  if (flg) {
    /* bring the values back to the host before handing control to the CPU kernels */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  a->inode.use  = flg; /* inode optimization only applies to the host kernels */
  PetscFunctionReturn(0);
}

/* Convert a SeqAIJ matrix to SeqAIJCUSPARSE: create/copy as requested, allocate the
   cuSPARSE side structures (handle bound to PETSc's default stream), and install the
   GPU operation table via MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE) */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat;

  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      spptr->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      spptr->spmvAlg    = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = spptr;
    } else {
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type constructor: build a SeqAIJ matrix and convert it in place */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

   Level: beginner

.seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);

/* Registers the cuSPARSE-backed solver types:
   MATSOLVERCUSPARSEBAND (LU on MATSEQAIJ) and
   MATSOLVERCUSPARSE (LU/Cholesky/ILU/ICC on MATSEQAIJCUSPARSE) */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}

/* Frees a Mat_SeqAIJCUSPARSE struct: the GPU mult structures (plain and transposed),
   the device work/COO arrays (delete of NULL is a no-op), and the cusparse handle */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->csr2csc_i;
    if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
    ierr =
PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Frees the three device arrays of a CsrMatrix and the struct itself; NULLs *mat */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    delete (*mat)->values;
    delete (*mat)->column_indices;
    delete (*mat)->row_offsets;
    delete *mat;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}

/* Frees one triangular-factor struct: matrix descriptor, solve analysis info, CSR storage,
   device solve buffer, pinned host copy and (CUDA >= 11) the csr2csc buffer.
   PetscFree also NULLs *trifactor. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    if ((*trifactor)->solveBuffer) {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
    if ((*trifactor)->AA_h) {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
#endif
    ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Overload for the mult (SpMV/SpMM) structure: frees the stored matrix according to
   its storage format, the scalar constants kept in device memory, and (CUDA >= 11)
   the cached cusparse generic-API descriptors */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat =
cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
#endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        CsrMatrix_Destroy(&mat); /* NOTE(review): return code dropped; no ierr is in scope in this function */
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* alpha_one/beta_zero/beta_one are scalar constants kept in device memory */
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    for (int i=0; i<3; i++) { /* free each of the three cached SpMV setups that were initialized */
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}

/* Frees the factorization data held inside a Mat_SeqAIJCUSPARSETriFactors without
   destroying the container or its cusparse handle.  Safe to call repeatedly:
   every freed member is reset to NULL so a later call cannot double-free. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector   = NULL;
    if ((*trifactors)->a_band_d) {
      cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);
      (*trifactors)->a_band_d = NULL; /* was left dangling: a second Reset would double-free */
    }
    if ((*trifactors)->i_band_d) {
      cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);
      (*trifactors)->i_band_d = NULL; /* ditto */
    }
  }
  PetscFunctionReturn(0);
}

/* Fully destroys a Mat_SeqAIJCUSPARSETriFactors: resets its contents, destroys the
   cusparse handle, then frees the struct (PetscFree NULLs *trifactors) */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    if ((handle = (*trifactors)->handle)) { /* extra parens: assignment is intentional */
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Lexicographic (row,col) ordering for COO entries; used as the sort comparator */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    if (t1.get<0>() < t2.get<0>()) return true;
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
    return false;
  }
};

/* Equality of (row,col) pairs; used to detect duplicate COO entries */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
    return true;
  }
};

/* 0 if the two values are equal, 1 otherwise; used with adjacent_difference to flag changes */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return t1 == t2 ?
0 : 1;
  }
};

/* Logical OR returned as a PetscInt; combines the row-change and column-change flags */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return t1||t2;
  }
};

#include <thrust/iterator/discard_iterator.h>
/* Sets values of a matrix preallocated with MatSetPreallocationCOO_SeqAIJCUSPARSE().
   v may be a host or device pointer (detected with isCudaMem()); a NULL v with
   INSERT_VALUES zeros the stored values.  ADD_VALUES adds to the current values,
   INSERT_VALUES replaces them; when the COO pattern contained duplicates
   (cusp->cooPerm_a != NULL) the duplicate entries are summed by reduce_by_key. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY        *cooPerm_v = NULL; /* temporary device copy when v is a host pointer */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix          *matrix;
  PetscErrorCode     ierr;
  cudaError_t        cerr;
  PetscInt           n;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) { /* no COO preallocation recorded: just run a plain assembly */
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else { /* host values: stage them on the device first */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) { /* duplicates present: reduce into a work array, then add in */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else { /* unique pattern: scatter-add values permuted into CSR order */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals());
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else { /* unique pattern: straight permuted copy into the CSR values */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    }
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU; /* GPU copy is now the authoritative one */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
  ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr);
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}

/* Marks the cached explicit transpose out of date; when destroy is true also frees it
   together with the csr2csc permutation array */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (!cusp) PetscFunctionReturn(0);
  if (destroy) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
    delete cusp->csr2csc_i;
    cusp->csr2csc_i = NULL;
  }
  A->transupdated = PETSC_FALSE;
  PetscFunctionReturn(0);
}

#include <thrust/binary_search.h>
/* Preallocates A from n COO (i,j) pairs: sorts them on the GPU, removes duplicates
   (recording in cooPerm_a, when duplicates exist, the map from sorted entries to
   unique slots) and rebuilds the host CSR structure of the underlying SeqAIJ data */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0; /* nzr counts the nonzero rows */
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  cooPerm_n = cusp->cooPerm ?
cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) { /* pattern size changed: previously cached permutations are stale */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm   = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);          /* device copy of the row indices */
    THRUSTINTARRAY d_j(n);          /* device copy of the column indices */
    THRUSTINTARRAY ii(A->rmap->n);  /* per-row cumulative counts; copied into a->i+1 below */

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* sort (i,j) lexicographically, carrying the original entry positions in cooPerm */
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());
    /* snapshot the sorted (still duplicated) indices before unique() compacts them */
    *cusp->cooPerm_a = d_i;
    THRUSTINTARRAY w = d_j;

    auto nekey = thrust::unique(fkey, ekey, IJEqual());
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* I couldn't come up with a more elegant algorithm */
      /* map every sorted COO entry to the index of its unique (i,j) slot:
         flag the positions where i or j changes, then prefix-sum the flags */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());
      (*cusp->cooPerm_a)[0] = 0;
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
    }
    /* for each row r, count the unique entries with row index <= r */
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(),
                        search_begin, search_begin + A->rmap->n,
                        ii.begin());
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* rebuild the host CSR arrays of the SeqAIJ part from the device data */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0;
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  } else { /* empty pattern */
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
The code is so convoluted now that I prefer to copy zeros */
  ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);

  /* only the pattern has been set: values still come via MatSetValuesCOO */
  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}

/* Returns a read-only device pointer to the CSR values (CSR storage format only);
   copies the matrix to the GPU first if needed.  Pair with MatSeqAIJCUSPARSERestoreArrayRead(). */
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  PetscFunctionReturn(0);
}

/* Ends read-only access: just clears the caller's pointer, no state change */
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  *a = NULL;
  PetscFunctionReturn(0);
}

/* Returns a writable device pointer to the CSR values (read-write access): copies to
   the GPU if needed, marks the GPU copy authoritative and invalidates the cached transpose */
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Ends read-write access: bumps the object state so dependents see the modification */
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL;
  PetscFunctionReturn(0);
}

/* Write-only variant: unlike GetArray it does NOT copy current values to the GPU;
   marks the GPU copy authoritative and invalidates the cached transpose */
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode
MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* ends write-only access: bump the object state so dependents see the new values */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL;
  PetscFunctionReturn(0);
}

/* Orders 4-tuples (row, col, value, flag) by (row, col); used when merging two COO streams */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    if (t1.get<0>() < t2.get<0>()) return true;
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
    return false;
  }
};

/* Adds a fixed offset to an (index) value */
struct Shift
{
  int _shift;

  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &c)
  {
    return c + _shift;
  }
};

/* merges two SeqAIJCUSPARSE matrices, [A';B']' operation in matlab notation */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     Annz,Bnnz;
  cusparseStatus_t             stat;
  PetscInt                     i,m,n,zero = 0;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n);
  if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3983 if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3984 if (reuse == MAT_INITIAL_MATRIX) { 3985 m = A->rmap->n; 3986 n = A->cmap->n + B->cmap->n; 3987 ierr = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr); 3988 ierr = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr); 3989 ierr = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3990 c = (Mat_SeqAIJ*)(*C)->data; 3991 Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 3992 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 3993 Ccsr = new CsrMatrix; 3994 Cmat->cprowIndices = NULL; 3995 c->compressedrow.use = PETSC_FALSE; 3996 c->compressedrow.nrows = 0; 3997 c->compressedrow.i = NULL; 3998 c->compressedrow.rindex = NULL; 3999 Ccusp->workVector = NULL; 4000 Ccusp->nrows = m; 4001 Ccusp->mat = Cmat; 4002 Ccusp->mat->mat = Ccsr; 4003 Ccsr->num_rows = m; 4004 Ccsr->num_cols = n; 4005 stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat); 4006 stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4007 stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 4008 cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 4009 cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 4010 cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 4011 cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4012 cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4013 cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4014 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4015 ierr = 
MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 4016 ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr); 4017 ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr); 4018 if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4019 if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4020 4021 Acsr = (CsrMatrix*)Acusp->mat->mat; 4022 Bcsr = (CsrMatrix*)Bcusp->mat->mat; 4023 Annz = (PetscInt)Acsr->column_indices->size(); 4024 Bnnz = (PetscInt)Bcsr->column_indices->size(); 4025 c->nz = Annz + Bnnz; 4026 Ccsr->row_offsets = new THRUSTINTARRAY32(m+1); 4027 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 4028 Ccsr->values = new THRUSTARRAY(c->nz); 4029 Ccsr->num_entries = c->nz; 4030 Ccusp->cooPerm = new THRUSTINTARRAY(c->nz); 4031 if (c->nz) { 4032 auto Acoo = new THRUSTINTARRAY32(Annz); 4033 auto Bcoo = new THRUSTINTARRAY32(Bnnz); 4034 auto Ccoo = new THRUSTINTARRAY32(c->nz); 4035 THRUSTINTARRAY32 *Aroff,*Broff; 4036 4037 if (a->compressedrow.use) { /* need full row offset */ 4038 if (!Acusp->rowoffsets_gpu) { 4039 Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 4040 Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 4041 ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 4042 } 4043 Aroff = Acusp->rowoffsets_gpu; 4044 } else Aroff = Acsr->row_offsets; 4045 if (b->compressedrow.use) { /* need full row offset */ 4046 if (!Bcusp->rowoffsets_gpu) { 4047 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 4048 Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 4049 ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 4050 } 4051 Broff = Bcusp->rowoffsets_gpu; 4052 } else Broff = Bcsr->row_offsets; 4053 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4054 stat = cusparseXcsr2coo(Acusp->handle, 4055 Aroff->data().get(), 4056 Annz, 4057 m, 4058 Acoo->data().get(), 4059 
CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4060 stat = cusparseXcsr2coo(Bcusp->handle, 4061 Broff->data().get(), 4062 Bnnz, 4063 m, 4064 Bcoo->data().get(), 4065 CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4066 /* Issues when using bool with large matrices on SUMMIT 10.2.89 */ 4067 auto Aperm = thrust::make_constant_iterator(1); 4068 auto Bperm = thrust::make_constant_iterator(0); 4069 #if PETSC_PKG_CUDA_VERSION_GE(10,0,0) 4070 auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n)); 4071 auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n)); 4072 #else 4073 /* there are issues instantiating the merge operation using a transform iterator for the columns of B */ 4074 auto Bcib = Bcsr->column_indices->begin(); 4075 auto Bcie = Bcsr->column_indices->end(); 4076 thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n)); 4077 #endif 4078 auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz); 4079 auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm)); 4080 auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm)); 4081 auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm)); 4082 auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm)); 4083 auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin())); 4084 auto p1 = Ccusp->cooPerm->begin(); 4085 auto p2 = Ccusp->cooPerm->begin(); 4086 thrust::advance(p2,Annz); 4087 PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4())); 4088 #if PETSC_PKG_CUDA_VERSION_LT(10,0,0) 4089 thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n)); 4090 #endif 4091 auto cci = thrust::make_counting_iterator(zero); 4092 auto cce = 
thrust::make_counting_iterator(c->nz); 4093 #if 0 //Errors on SUMMIT cuda 11.1.0 4094 PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>())); 4095 #else 4096 auto pred = thrust::identity<int>(); 4097 PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred)); 4098 PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred)); 4099 #endif 4100 stat = cusparseXcoo2csr(Ccusp->handle, 4101 Ccoo->data().get(), 4102 c->nz, 4103 m, 4104 Ccsr->row_offsets->data().get(), 4105 CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4106 cerr = WaitForCUDA();CHKERRCUDA(cerr); 4107 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4108 delete wPerm; 4109 delete Acoo; 4110 delete Bcoo; 4111 delete Ccoo; 4112 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4113 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, 4114 Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), 4115 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 4116 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 4117 #endif 4118 if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */ 4119 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4120 Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct; 4121 CsrMatrix *CcsrT = new CsrMatrix; 4122 CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4123 CsrMatrix *BcsrT = BT ? 
(CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4124 4125 (*C)->form_explicit_transpose = PETSC_TRUE; 4126 (*C)->transupdated = PETSC_TRUE; 4127 Ccusp->rowoffsets_gpu = NULL; 4128 CmatT->cprowIndices = NULL; 4129 CmatT->mat = CcsrT; 4130 CcsrT->num_rows = n; 4131 CcsrT->num_cols = m; 4132 CcsrT->num_entries = c->nz; 4133 4134 CcsrT->row_offsets = new THRUSTINTARRAY32(n+1); 4135 CcsrT->column_indices = new THRUSTINTARRAY32(c->nz); 4136 CcsrT->values = new THRUSTARRAY(c->nz); 4137 4138 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4139 auto rT = CcsrT->row_offsets->begin(); 4140 if (AT) { 4141 rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT); 4142 thrust::advance(rT,-1); 4143 } 4144 if (BT) { 4145 auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz)); 4146 auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz)); 4147 thrust::copy(titb,tite,rT); 4148 } 4149 auto cT = CcsrT->column_indices->begin(); 4150 if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT); 4151 if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT); 4152 auto vT = CcsrT->values->begin(); 4153 if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4154 if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 4155 cerr = WaitForCUDA();CHKERRCUDA(cerr); 4156 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4157 4158 stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat); 4159 stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4160 stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 4161 cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 4162 cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 4163 cerr = cudaMalloc((void **)&(CmatT->beta_one), 
sizeof(PetscScalar));CHKERRCUDA(cerr); 4164 cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4165 cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4166 cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4167 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4168 stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, 4169 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), 4170 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 4171 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 4172 #endif 4173 Ccusp->matTranspose = CmatT; 4174 } 4175 } 4176 4177 c->singlemalloc = PETSC_FALSE; 4178 c->free_a = PETSC_TRUE; 4179 c->free_ij = PETSC_TRUE; 4180 ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 4181 ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 4182 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 4183 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 4184 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 4185 ii = *Ccsr->row_offsets; 4186 jj = *Ccsr->column_indices; 4187 cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4188 cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4189 } else { 4190 cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4191 cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4192 } 4193 ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + 
Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 4194 ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 4195 ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 4196 c->maxnz = c->nz; 4197 c->nonzerorowcnt = 0; 4198 c->rmax = 0; 4199 for (i = 0; i < m; i++) { 4200 const PetscInt nn = c->i[i+1] - c->i[i]; 4201 c->ilen[i] = c->imax[i] = nn; 4202 c->nonzerorowcnt += (PetscInt)!!nn; 4203 c->rmax = PetscMax(c->rmax,nn); 4204 } 4205 ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr); 4206 ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 4207 (*C)->nonzerostate++; 4208 ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr); 4209 ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr); 4210 Ccusp->nonzerostate = (*C)->nonzerostate; 4211 (*C)->preallocated = PETSC_TRUE; 4212 } else { 4213 if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n); 4214 c = (Mat_SeqAIJ*)(*C)->data; 4215 if (c->nz) { 4216 Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 4217 if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm"); 4218 if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4219 if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate"); 4220 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4221 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 4222 if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4223 if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4224 Acsr = (CsrMatrix*)Acusp->mat->mat; 4225 Bcsr = (CsrMatrix*)Bcusp->mat->mat; 4226 Ccsr = (CsrMatrix*)Ccusp->mat->mat; 4227 if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size()); 4228 if (Bcsr->num_entries != 
(PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size()); 4229 if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size()); 4230 if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries); 4231 if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size()); 4232 auto pmid = Ccusp->cooPerm->begin(); 4233 thrust::advance(pmid,Acsr->num_entries); 4234 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4235 auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), 4236 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin()))); 4237 auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), 4238 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4239 thrust::for_each(zibait,zieait,VecCUDAEquals()); 4240 auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), 4241 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4242 auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), 4243 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end()))); 4244 thrust::for_each(zibbit,ziebit,VecCUDAEquals()); 4245 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr); 4246 if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) { 4247 if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct"); 4248 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? 
PETSC_TRUE : PETSC_FALSE;
      CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
      CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
      CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
      auto      vT = CcsrT->values->begin();
      /* concatenate the (already formed) transpose values of A and B into C^T */
      if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
      if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
      (*C)->transupdated = PETSC_TRUE;
    }
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  }
  }
  ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}

/*
  Copies (a subset of) the matrix values of A into the array v.

  If idx is provided, v[k] = a[idx[k]] for k = 0..n-1 (gather done on the GPU with a
  permutation iterator); otherwise the first n values are copied contiguously.
  v may be either host or device memory (detected with isCudaMem); for host memory
  the result is staged through a temporary device array (or copied directly in the
  contiguous case) and the device->host traffic is logged.
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;
  const PetscScalar *av;
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n); /* indices go host -> device */
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    THRUSTARRAY *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      /* stage the gather in a device buffer, then copy to the host array below */
      w  = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) {
      cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    delete w;
  } else {
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  /* FIX: the copy above moves data device -> host, so log it as GPU-to-CPU traffic
     (the original code wrongly called PetscLogCpuToGpu here) */
  if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); }
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
  LU BAND factorization with optimization for block diagonal (Nf blocks) in natural order (-mat_no_inode -pc_factor_mat_ordering_type rcm with Nf>1 fields)

  requires:
    structurally symmetric: fix with transpose/column meta data
*/

/*
  The GPU LU factor kernel

  Fills bi_csr (row offsets of the banded storage, one entry per row plus a leading 0).
  Launch layout (from the symbolic-factor caller): grid = (Nf, Nblk) with one x-block
  per field; rows are strided over threadIdx.y, only threadIdx.x == 0 writes.
*/
__global__
void __launch_bounds__(1024,1)
mat_lu_factor_band_init_set_i(const PetscInt n, const int bw, int bi_csr[])
{
  const PetscInt Nf = gridDim.x, Nblk = gridDim.y, nloc = n/Nf;
  const PetscInt field = blockIdx.x, blkIdx = blockIdx.y;
  const PetscInt nloc_i = (nloc/Nblk + !!(nloc%Nblk)), start_i = field*nloc + blkIdx*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);

  // set i (row+1)
  if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y == 0) bi_csr[0] = 0; // dummy at zero
  // for (int rowb = start_i + blkIdx*blockDim.y + threadIdx.y; rowb < end_i; rowb += Nblk*blockDim.y) { // rows in block
  for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) { // rows in block by thread y
    if (rowb < end_i && threadIdx.x==0) {
      /* closed-form prefix sum of band row lengths, clipped at the top-left and bottom-right corners */
      PetscInt i=rowb+1, ni = (rowb>bw) ? bw+1 : i, n1L = ni*(ni-1)/2, nug= i*bw, n2L = bw*((rowb>bw) ? (rowb-bw) : 0), mi = bw + rowb + 1 - n, clip = (mi>0) ?
mi*(mi-1)/2 + mi: 0;
      bi_csr[rowb+1] = n1L + nug - clip + n2L + i;
    }
  }
}
// copy AIJ to AIJ_BAND
// Zeroes the banded storage ba_csr, then scatters the (row/column permuted) AIJ
// entries of A into it. Grid layout: (Nf fields, Nblk blocks); rows are strided
// over threadIdx.y, nonzeros within a row over threadIdx.x.
//   r  : row permutation, ic : inverse column permutation (both device arrays)
//   ai_d/aj_d/aa_d : CSR of A on device; bi_csr/ba_csr : banded output
__global__
void __launch_bounds__(1024,1)
mat_lu_factor_band_copy_aij_aij(const PetscInt n, const int bw, const PetscInt r[], const PetscInt ic[],
                                const int ai_d[], const int aj_d[], const PetscScalar aa_d[],
                                const int bi_csr[], PetscScalar ba_csr[])
{
  const PetscInt Nf = gridDim.x, Nblk = gridDim.y, nloc = n/Nf;
  const PetscInt field = blockIdx.x, blkIdx = blockIdx.y;
  const PetscInt nloc_i = (nloc/Nblk + !!(nloc%Nblk)), start_i = field*nloc + blkIdx*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);

  // zero B
  if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y == 0) ba_csr[bi_csr[n]] = 0; // flop count at end
  for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) { // rows in block by thread y
    if (rowb < end_i) {
      PetscScalar    *batmp = ba_csr + bi_csr[rowb];
      const PetscInt nzb = bi_csr[rowb+1] - bi_csr[rowb];
      for (int j=threadIdx.x ; j<nzb ; j += blockDim.x) {
        if (j<nzb) {
          batmp[j] = 0;
        }
      }
    }
  }

  // copy A into B with CSR format -- these two loops can be fused
  // NOTE(review): no barrier between the zeroing and copy loops -- each row is
  // zeroed and filled by the same threadIdx.y lane, which appears to make this
  // safe; confirm if the loop structure is ever changed.
  for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) { // rows in block by thread y
    if (rowb < end_i) {
      const PetscInt    rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
      const int         *ajtmp = aj_d + ai_d[rowa], bjStart = (rowb>bw) ? rowb-bw : 0;
      const PetscScalar *av    = aa_d + ai_d[rowa];
      PetscScalar       *batmp = ba_csr + bi_csr[rowb];
      /* load in initial (unfactored row) */
      for (int j=threadIdx.x ; j<nza ; j += blockDim.x) {
        if (j<nza) {
          PetscInt    colb = ic[ajtmp[j]], idx = colb - bjStart; // position within the band of this row
          PetscScalar vala = av[j];
          batmp[idx] = vala;
        }
      }
    }
  }
}
// print AIJ_BAND
// Debug-only kernel: thread (0,0,0) of block (0,0) serially prints the banded matrix.
__global__
void print_mat_aij_band(const PetscInt n, const int bi_csr[], const PetscScalar ba_csr[])
{
  // debug
  if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y == 0){
    printf("B (AIJ) n=%d:\n",(int)n);
    for (int rowb=0;rowb<n;rowb++) {
      const PetscInt    nz = bi_csr[rowb+1] - bi_csr[rowb];
      const PetscScalar *batmp = ba_csr + bi_csr[rowb];
      for (int j=0; j<nz; j++) printf("(%13.6e) ",PetscRealPart(batmp[j]));
      printf(" bi=%d\n",bi_csr[rowb+1]);
    }
  }
}
// Band LU kernel --- ba_csr bi_csr
// In-place LU of the banded storage, one field per blockIdx.x. On CUDA >= 11 the
// outer pivot loop is ordered with a cooperative-groups grid sync (the caller must
// use cudaLaunchCooperativeKernel); pre-11 it falls back to a block-level barrier.
// Dynamic shared memory: blockDim.y PetscInts (per-row column offsets).
__global__
void __launch_bounds__(1024,1)
mat_lu_factor_band(const PetscInt n, const PetscInt bw, const int bi_csr[], PetscScalar ba_csr[])
{
  extern __shared__ PetscInt smemInt[];
  PetscInt       *sm_pkIdx = &smemInt[0];
  const PetscInt Nf = gridDim.x, Nblk = gridDim.y, nloc = n/Nf;
  const PetscInt field = blockIdx.x, blkIdx = blockIdx.y;
  const PetscInt start = field*nloc, end = start + nloc;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  auto g = cooperative_groups::this_grid();
#endif
  // A22 panel update for each row A(1,:) and col A(:,1)
  for (int glbDD=start, locDD = 0; glbDD<end; glbDD++, locDD++) {
    PetscInt       tnzUd = bw, maxU = end-1 - glbDD; // we are chopping off the inter ears
    const PetscInt nzUd  = (tnzUd>maxU) ? maxU : tnzUd, dOffset = (glbDD > bw) ?
bw : glbDD; // global to go past ears after first
    const PetscInt    nzUd_pad = blockDim.y*(nzUd/blockDim.y + !!(nzUd%blockDim.y)); // pad so every y-lane hits the barrier below
    PetscScalar       *pBdd = ba_csr + bi_csr[glbDD] + dOffset;
    const PetscScalar *baUd = pBdd + 1; // vector of data U(i,i+1:end)
    const PetscScalar Bdd = *pBdd;
    const PetscInt    offset = blkIdx*blockDim.y + threadIdx.y, inc = Nblk*blockDim.y;
    for (int idx = offset, myi = glbDD + offset + 1; idx < nzUd_pad ; idx += inc, myi += inc) { /* assuming symmetric structure */
      if (idx < nzUd && threadIdx.x==0) { /* assuming symmetric structure */
        const PetscInt bwi = myi > bw ? bw : myi, kIdx = bwi - (myi-glbDD); // cuts off just the first (global) block
        PetscScalar    *Aid = ba_csr + bi_csr[myi] + kIdx;
        *Aid = *Aid/Bdd; // L(i,dd) = A(i,dd)/A(dd,dd)
        sm_pkIdx[threadIdx.y] = kIdx;
      }
      __syncthreads(); // synch on threadIdx.x only
      if (idx < nzUd) { /* assuming symmetric structure */
        PetscInt    kIdx = sm_pkIdx[threadIdx.y];
        PetscScalar *Aid = ba_csr + bi_csr[myi] + kIdx;
        PetscScalar *Aij = Aid + 1;
        PetscScalar Lid  = *Aid;
        for (int jIdx=threadIdx.x ; jIdx<nzUd ; jIdx += blockDim.x) {
          if (jIdx<nzUd) {
            Aij[jIdx] -= Lid*baUd[jIdx]; // rank-1 update of the trailing submatrix row
          }
        }
      }
    }
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    g.sync(); // whole-grid ordering of pivots (cooperative launch)
#else
    __syncthreads();
#endif
  } /* endof for (i=0; i<n; i++) { */
}

static PetscErrorCode MatSolve_SeqAIJCUSPARSEBAND(Mat,Vec,Vec);

/*
  Numeric band LU factorization of A into the banded storage held in B->spptr.

  Uses the permutations and banded layout prepared by MatLUFactorSymbolic_SeqAIJCUSPARSEBAND.
  The optional "Nf" composed object on A encodes (fields) + 1000*(concurrency).
  Sets B's solve function pointers on success.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSEBAND(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ                   *b = (Mat_SeqAIJ*)B->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  Mat_SeqAIJCUSPARSE           *cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructA;
  CsrMatrix                    *matrixA;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  const PetscInt               n=A->rmap->n, *ic, *r;
  const int                    *ai_d, *aj_d;
  const PetscScalar            *aa_d;
  PetscScalar                  *ba_t = cusparseTriFactors->a_band_d;
  int                          *bi_t = cusparseTriFactors->i_band_d;
  PetscContainer               container;
  int                          Ni = 10, team_size=9, Nf, nVec=56, nconcurrent = 1, nsm = -1;

  PetscFunctionBegin;
  if (A->rmap->n == 0) {
    PetscFunctionReturn(0);
  }
  // cusparse setup
  if (!cusparsestructA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparsestructA");
  matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat; // matstruct->cprowIndices
  if (!matstructA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing mat struct");
  matrixA = (CsrMatrix*)matstructA->mat;
  if (!matrixA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing matrix cusparsestructA->mat->mat");

  // factor: get Nf if available
  ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr);
  if (container) {
    PetscInt *pNf=NULL;
    ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr);
    Nf = (*pNf)%1000;
    if ((*pNf)/1000>0) nconcurrent = (*pNf)/1000; // number of SMs to use
  } else Nf = 1;
  if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf);

  // get data
  ic   = thrust::raw_pointer_cast(cusparseTriFactors->cpermIndices->data());
  ai_d = thrust::raw_pointer_cast(matrixA->row_offsets->data());
  aj_d = thrust::raw_pointer_cast(matrixA->column_indices->data());
  aa_d = thrust::raw_pointer_cast(matrixA->values->data().get());
  r    = thrust::raw_pointer_cast(cusparseTriFactors->rpermIndices->data());

  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  {
    /* recover the bandwidth from nz = n + (2n-1)bw - bw^2 (see the symbolic factorization) */
    int bw = (2*n-1 - (int)(PetscSqrtReal(1+4*(n*n-b->nz))+PETSC_MACHINE_EPSILON))/2, bm1=bw-1,nl=n/Nf;
    int            gpuid;
    cudaDeviceProp prop;
    /* FIX: check the CUDA runtime return codes (they were silently dropped),
       consistent with every other CUDA call in this file */
    cerr = cudaGetDevice(&gpuid);CHKERRCUDA(cerr);
    cerr = cudaGetDeviceProperties(&prop, gpuid);CHKERRCUDA(cerr);
#if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
    Ni = 1/nconcurrent;
    Ni = 1;
#else
    nsm = prop.multiProcessorCount;
    Ni = nsm/Nf/nconcurrent;
#endif
    team_size = bw/Ni + !!(bw%Ni);
    nVec = PetscMin(bw, 1024/team_size);
    ierr = PetscInfo5(A,"Matrix Bandwidth = %d, number SMs/block = %d, num concurency = %d, num fields = %d, numSMs/GPU = %d\n",bw,Ni,nconcurrent,Nf,nsm);CHKERRQ(ierr);
    {
      dim3 dimBlockTeam(nVec,team_size);
      dim3 dimBlockLeague(Nf,Ni);
      mat_lu_factor_band_copy_aij_aij<<<dimBlockLeague,dimBlockTeam>>>(n, bw, r, ic, ai_d, aj_d, aa_d, bi_t, ba_t);
      CHECK_LAUNCH_ERROR(); // does a sync
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* cooperative launch is required for the grid-wide sync inside mat_lu_factor_band */
      void *kernelArgs[] = { (void*)&n, (void*)&bw, (void*)&bi_t, (void*)&ba_t};
      /* FIX: check the launch return code (it was silently dropped) */
      cerr = cudaLaunchCooperativeKernel((void*)mat_lu_factor_band, dimBlockLeague, dimBlockTeam, kernelArgs, team_size*sizeof(PetscInt), NULL);CHKERRCUDA(cerr);
#else
      mat_lu_factor_band<<<dimBlockLeague,dimBlockTeam,team_size*sizeof(PetscInt)>>>(n, bw, bi_t, ba_t);
#endif
      CHECK_LAUNCH_ERROR(); // does a sync
#if defined(PETSC_USE_LOG)
      ierr = PetscLogGpuFlops((PetscLogDouble)Nf*(bm1*(bm1 + 1)*(2*bm1 + 1)/3 + 2*(nl-bw)*bw*bw + nl*(nl+1)/2));CHKERRQ(ierr);
#endif
    }
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* determine which version of MatSolve needs to be used.
from MatLUFactorNumeric_AIJ_SeqAIJCUSPARSE */
  B->ops->solve             = MatSolve_SeqAIJCUSPARSEBAND;
  B->ops->solvetranspose    = NULL; // need transpose
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  PetscFunctionReturn(0);
}

/* PetscContainer destructor for the "Nf" (number of fields) attribute */
static PetscErrorCode MatrixNfDestroy(void *ptr)
{
  PetscInt       *nf = (PetscInt *)ptr;
  PetscErrorCode ierr;
  PetscFunctionBegin;
  ierr = PetscFree(nf);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
  Symbolic band LU factorization: computes the bandwidth of the (reordered) matrix,
  allocates the banded device storage in B->spptr, fills the banded row offsets on
  the GPU, caches the row/column permutations on the device, and propagates the
  optional "Nf" attribute from A to B. Only structurally symmetric matrices
  (bwL == bwU after reordering) are supported.
*/
PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSEBAND(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data,*b;
  IS                           isicol;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  const PetscInt               *ic,*ai=a->i,*aj=a->j;
  PetscScalar                  *ba_t;
  int                          *bi_t;
  PetscInt                     i,n=A->rmap->n,Nf;
  PetscInt                     nzBcsr,bwL,bwU;
  PetscBool                    missing;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscContainer               container;

  PetscFunctionBegin;
  if (A->rmap->N != A->cmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"matrix must be square");
  ierr = MatMissingDiagonal(A,&missing,&i);CHKERRQ(ierr);
  if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",i);
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"!cusparseTriFactors");
  ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&missing);CHKERRQ(ierr); // 'missing' reused as the option flag here
  if (!missing) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"only structrally symmetric matrices supported");

  // factor: get Nf if available, and re-compose it on B (with its own copy + destructor)
  ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr);
  if (container) {
    PetscInt *pNf=NULL;
    ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr);
    Nf = (*pNf)%1000;
    ierr = PetscContainerCreate(PETSC_COMM_SELF, &container);CHKERRQ(ierr);
    ierr = PetscMalloc(sizeof(PetscInt), &pNf);CHKERRQ(ierr);
    *pNf = Nf;
    ierr = PetscContainerSetPointer(container, (void *)pNf);CHKERRQ(ierr);
    ierr = PetscContainerSetUserDestroy(container, MatrixNfDestroy);CHKERRQ(ierr);
    ierr = PetscObjectCompose((PetscObject)B, "Nf", (PetscObject) container);CHKERRQ(ierr);
    ierr = PetscContainerDestroy(&container);CHKERRQ(ierr);
  } else Nf = 1;
  if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf);

  ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
  ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);

  ierr = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)isicol);CHKERRQ(ierr);
  b    = (Mat_SeqAIJ*)(B)->data;

  /* get band widths, MatComputeBandwidth should take a reordering ic and do this */
  bwL = bwU = 0;
  for (int rwb=0; rwb<n; rwb++) {
    const PetscInt rwa = ic[rwb], anz = ai[rwb+1] - ai[rwb], *ajtmp = aj + ai[rwb];
    for (int j=0;j<anz;j++) {
      PetscInt colb = ic[ajtmp[j]];
      if (colb<rwa) { // L
        if (rwa-colb > bwL) bwL = rwa-colb;
      } else {
        if (colb-rwa > bwU) bwU = colb-rwa;
      }
    }
  }
  ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
  /* only support structurally symmetric, but it might work */
  if (bwL!=bwU) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Only symmetric structure supported (now) W_L=%D W_U=%D",bwL,bwU);
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  nzBcsr = n + (2*n-1)*bwU - bwU*bwU; // nonzeros of the clipped band
  b->maxnz = b->nz = nzBcsr;
  cusparseTriFactors->nnz = b->nz; // only meta data needed: n & nz
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cerr = cudaMalloc(&ba_t,(b->nz+1)*sizeof(PetscScalar));CHKERRCUDA(cerr); // include a place for flops
  cerr = cudaMalloc(&bi_t,(n+1)*sizeof(int));CHKERRCUDA(cerr);
  cusparseTriFactors->a_band_d = ba_t;
  cusparseTriFactors->i_band_d = bi_t;
  /* In b structure: Free imax, ilen, old a, old j. Allocate solve_work, new a, new j */
  ierr = PetscLogObjectMemory((PetscObject)B,(nzBcsr+1)*(sizeof(PetscInt)+sizeof(PetscScalar)));CHKERRQ(ierr);
  {
    dim3 dimBlockTeam(1,128);
    dim3 dimBlockLeague(Nf,1);
    mat_lu_factor_band_init_set_i<<<dimBlockLeague,dimBlockTeam>>>(n, bwU, bi_t);
  }
  CHECK_LAUNCH_ERROR(); // does a sync

  // setup data: cache the row permutation on the device
  if (!cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
    ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  /* upper triangular indices: cache the inverse column permutation on the device */
  if (!cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    ierr = ISGetIndices(isicol,&c);CHKERRQ(ierr);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
    ierr = ISRestoreIndices(isicol,&c);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* put together the new matrix */
  b->free_a       = PETSC_FALSE;
  b->free_ij      = PETSC_FALSE;
  b->singlemalloc = PETSC_FALSE;
  b->ilen = NULL;
  b->imax = NULL;
  b->row  = isrow;
  b->col  = iscol;
  ierr    = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
  ierr    = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
  b->icol = isicol;
  ierr    = PetscMalloc1(n+1,&b->solve_work);CHKERRQ(ierr);

  B->factortype            = MAT_FACTOR_LU;
  B->info.factor_mallocs   = 0;
  B->info.fill_ratio_given = 0;

  if (ai[n]) {
    B->info.fill_ratio_needed = ((PetscReal)(nzBcsr))/((PetscReal)ai[n]);
  } else {
    B->info.fill_ratio_needed = 0.0;
  }
#if defined(PETSC_USE_INFO)
  if (ai[n] != 0) {
    PetscReal af = B->info.fill_ratio_needed;
    ierr = PetscInfo1(A,"Band fill ratio %g\n",(double)af);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(A,"Empty matrix\n");CHKERRQ(ierr);
  }
#endif
  if (a->inode.size) {
    ierr = PetscInfo(A,"Warning: using inodes in band solver.\n");CHKERRQ(ierr);
  }
  ierr = MatSeqAIJCheckInode_FactorLU(B);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSEBAND;
  B->offloadmask = PETSC_OFFLOAD_GPU;

  PetscFunctionReturn(0);
}

/* Use -pc_factor_mat_solver_type cusparseband */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse_band(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSEBAND;
  PetscFunctionReturn(0);
}

/* Factory for the CUSPARSEBAND LU factor matrix (only MAT_FACTOR_LU is supported) */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype  = ftype;
  (*B)->useordering = PETSC_TRUE;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSEBAND;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSEBAND Matrix Types");

  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse_band);CHKERRQ(ierr);
PetscFunctionReturn(0);
}

#define WARP_SIZE 32
// Warp-level sum reduction with shuffles; result valid in lane 0.
// Assumes all 32 lanes of the warp participate (full 0xffffffff mask).
template <typename T>
__forceinline__ __device__
T wreduce(T a)
{
  T b;
  #pragma unroll
  for (int i = WARP_SIZE/2; i >= 1; i = i >> 1) {
    b = __shfl_down_sync(0xffffffff, a, i);
    a += b;
  }
  return a;
}
// reduce in a block, returns result in thread 0
// Two stages: per-warp shuffle reduce, then warp 0 combines the per-warp
// partials staged in shared memory. Requires blockDim.x == BLOCK_SIZE.
// NOTE(review): the staging buffer is declared double even when T is a
// narrower real type (values are preserved, but confirm this is intentional);
// extra lanes of warp 0 contribute a = 0, so the final shuffle sum is exact.
template <typename T, int BLOCK_SIZE>
__device__
T breduce(T a)
{
  constexpr int NWARP = BLOCK_SIZE/WARP_SIZE;
  __shared__ double buf[NWARP];
  int wid    = threadIdx.x / WARP_SIZE;
  int laneid = threadIdx.x % WARP_SIZE;
  T b = wreduce<T>(a);
  if (laneid == 0)
    buf[wid] = b;
  __syncthreads();
  if (wid == 0) {
    if (threadIdx.x < NWARP)
      a = buf[threadIdx.x];
    else
      a = 0;
    for (int i = (NWARP+1)/2; i >= 1; i = i >> 1) {
      a += __shfl_down_sync(0xffffffff, a, i);
    }
  }
  return a;
}


// Band LU kernel --- ba_csr bi_csr
// Forward (L) then backward (U) substitution over the banded factor, one field
// per blockIdx.x; x is permuted input/output, bw is the (half) bandwidth.
// pLi walks the banded storage; the offset arithmetic accounts for the clipped
// "ears" at the corners of each field's band.
template <int BLOCK_SIZE>
__global__
void __launch_bounds__(256,1)
mat_solve_band(const PetscInt n, const PetscInt bw, const PetscScalar ba_csr[], PetscScalar x[])
{
  const PetscInt    Nf = gridDim.x, nloc = n/Nf, field = blockIdx.x, start = field*nloc, end = start + nloc, chopnz = bw*(bw+1)/2, blocknz=(2*bw+1)*nloc, blocknz_0 = blocknz-chopnz;
  const PetscScalar *pLi;
  const int         tid = threadIdx.x;

  /* Next, solve L */
  pLi = ba_csr + (field==0 ? 0 : blocknz_0 + (field-1)*blocknz + bw); // diagonal (0,0) in field
  for (int glbDD=start, locDD = 0; glbDD<end; glbDD++, locDD++) {
    const PetscInt col = locDD<bw ? start : (glbDD-bw); // first column of L in this row
    PetscScalar t = 0;
    for (int j=col+tid,idx=tid;j<glbDD;j+=blockDim.x,idx+=blockDim.x) {
      t += pLi[idx]*x[j]; // strided partial dot product L(glbDD,col:glbDD-1) . x
    }
#if defined(PETSC_USE_COMPLEX)
    PetscReal tr = PetscRealPartComplex(t), ti = PetscImaginaryPartComplex(t);
    PetscScalar tt(breduce<PetscReal,BLOCK_SIZE>(tr), breduce<PetscReal,BLOCK_SIZE>(ti));
    t = tt;
#else
    t = breduce<PetscReal,BLOCK_SIZE>(t);
#endif
    if (threadIdx.x == 0)
      x[glbDD] -= t; // /1.0  (L has unit diagonal)
    __syncthreads();
    // inc
    pLi += glbDD-col; // get to diagonal
    if (glbDD > n-1-bw) pLi += n-1-glbDD; // skip over U, only last block has funny offset
    else pLi += bw;
    pLi += 1; // skip to next row
    if (field>0 && (locDD+1)<bw) pLi += bw-(locDD+1); // skip padding at beginning (ear)
  }
  /* Then, solve U */
  pLi = ba_csr + Nf*blocknz - 2*chopnz - 1; // end of real data on block (diagonal)
  if (field != Nf-1) pLi -= blocknz_0 + (Nf-2-field)*blocknz + bw; // diagonal of last local row
  for (int glbDD=end-1, locDD = 0; glbDD >= start; glbDD--, locDD++) {
    const PetscInt col = (locDD<bw) ?
end-1 : glbDD+bw; // end of row in U
    PetscScalar t = 0;
    for (int j=col-tid,idx=tid;j>glbDD;j-=blockDim.x,idx+=blockDim.x) {
      t += pLi[-idx]*x[j]; // strided partial dot product U(glbDD,glbDD+1:col) . x (walking backwards)
    }
#if defined(PETSC_USE_COMPLEX)
    PetscReal tr = PetscRealPartComplex(t), ti = PetscImaginaryPartComplex(t);
    PetscScalar tt(breduce<PetscReal,BLOCK_SIZE>(tr), breduce<PetscReal,BLOCK_SIZE>(ti));
    t = tt;
#else
    t = breduce<PetscReal,BLOCK_SIZE>(PetscRealPart(t));
#endif
    pLi -= col-glbDD; // diagonal
    if (threadIdx.x == 0) {
      x[glbDD] -= t;
      x[glbDD] /= pLi[0]; // divide by U(glbDD,glbDD)
    }
    __syncthreads();
    // inc past L to start of previous U
    pLi -= bw+1;
    if (glbDD<bw) pLi += bw-glbDD; // overshot in top left corner
    if (((locDD+1) < bw) && field != Nf-1) pLi -= (bw - (locDD+1)); // skip past right corner
  }
}

/*
  Solves A x = b with the band LU factor produced by MatLUFactorNumeric_SeqAIJCUSPARSEBAND:
  permute b with the row permutation into the work vector, run the banded
  forward/backward substitution kernel (one block of 128 threads per field),
  then permute the result with the column permutation into x. The bandwidth is
  recovered from nnz = n + (2n-1)bw - bw^2.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSEBAND(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscInt                              n=A->rmap->n, nz=cusparseTriFactors->nnz, bw=(2*n-1 - (int)(PetscSqrtReal(1+4*(n*n-nz))+PETSC_MACHINE_EPSILON))/2, Nf;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;
  PetscContainer                        container;

  PetscFunctionBegin;
  if (A->rmap->n == 0) {
    PetscFunctionReturn(0);
  }
  // factor: get Nf if available
  ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr);
  if (container) {
    PetscInt *pNf=NULL;
    ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr);
    Nf = (*pNf)%1000;
  } else Nf = 1;
  if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf);

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());
  constexpr int block = 128;
  mat_solve_band<block><<<Nf,block>>>(n,bw,cusparseTriFactors->a_band_d,tempGPU->data().get());
  CHECK_LAUNCH_ERROR(); // does a sync

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}