/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>

/* Option-name table consumed by PetscOptionsEnum(): value strings first, then the
   enum type name, the option prefix, and a terminating 0 entry. */
const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)        = 1,
      CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)        = 2,
      CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)        = 3,
      CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)        = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
  } cusparseCsr2CscAlg_t;
*/
/* Entries below are listed in 0-based enum-value order; note CUSPARSE_SPMM_COO_ALG4=5
   comes after CSR_ALG1=4, hence "COO_ALG4" appearing after "CSR_ALG1". */
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif

/* Forward declarations for the SeqAIJCUSPARSE factorization support */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* Forward declarations for triangular solves and matrix-vector operations */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

/* Forward declarations for the destroy/reset helpers */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

/* Record a user-provided CUDA stream on A's cuSPARSE context and make the
   cuSPARSE handle issue its work on that stream. Errors if A has no GPU data
   structure attached yet. */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cstat;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusp->stream = stream;
  cstat = cusparseSetStream(cusp->handle,cusp->stream);CHKERRCUSPARSE(cstat);
  PetscFunctionReturn(0);
}

/* Replace the cuSPARSE handle stored on A with a user-provided one. A handle
   already held by A (and different from the new one) is destroyed first. The
   handle is then switched to device pointer mode, i.e. cuSPARSE will expect
   scalar arguments by device pointer. */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cstat;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusp->handle != handle) {
    if (cusp->handle) { /* release the handle we currently hold before adopting the new one */
      cstat = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(cstat);
    }
    cusp->handle = handle;
  }
  cstat = cusparseSetPointerMode(cusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(cstat);
  PetscFunctionReturn(0);
}

/* Drop A's reference to its cuSPARSE handle WITHOUT destroying it; intended
   for handles owned by someone else. A no-op for matrices that are not
   MATSEQAIJCUSPARSE or have no GPU data structure. */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          iscusp;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&iscusp);CHKERRQ(ierr);
  if (!iscusp || !cusp) PetscFunctionReturn(0);
  if (cusp->handle) cusp->handle = 0; /* forget the reference only; cusparseDestroy() is not called here */
  PetscFunctionReturn(0);
}

/* Query callback composed on factor matrices: reports MATSOLVERCUSPARSE. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/* Create the (empty) factor matrix B for the requested factorization type and
   wire up the symbolic-factorization function pointers; the matching numeric
   routines are installed by the symbolic phase. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;  /* the factor is square, sized by A's row map */

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  (*B)->useordering = PETSC_TRUE;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* the factor gets its storage from the factorization, not from preallocation */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type-specific implementation behind MatCUSPARSESetFormat(); note that for
   SEQAIJCUSPARSE both MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL currently set the
   same single format field. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.
   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation if one is composed on A */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
   MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose

   Collective on mat

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
-  transgen - the boolean flag

   Level: intermediate

.seealso: MATSEQAIJCUSPARSE
@*/
PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
{
  PetscErrorCode ierr;
  PetscBool      flg;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  ierr =
PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (flg) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
    cusp->transgen = transgen;
    if (!transgen) { /* need to destroy the transpose matrix if present to prevent from logic errors if transgen is set to true later */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}

/* Process the -mat_cusparse_* runtime options for a SeqAIJCUSPARSE matrix.
   Storage-format and algorithm options only apply to non-factored matrices. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscBool transgen = cusparsestruct->transgen;

    ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);}

    /* format for SpMV only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    /* format for all operations; parsed after the SpMV-only option so it can override it */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    /* same positional-consistency check as for SpMV above */
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
PetscFunctionReturn(0);
}

/* Symbolic ILU(k): discard any previously built GPU triangular factors, run
   the CPU SeqAIJ symbolic phase, then route the numeric factorization to the
   CUSPARSE implementation. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic LU: same pattern as ILU above (CPU symbolic, CUSPARSE numeric). */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic ICC(k): CPU symbolic phase, numeric phase routed to CUSPARSE. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic Cholesky: CPU symbolic phase, numeric phase routed to CUSPARSE. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr =
MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Build (or refresh) the lower triangular factor L on the GPU from the host
   ILU factorization stored in A. L is assembled with an explicit unit
   diagonal (CUSPARSE_DIAG_TYPE_UNIT) in pinned host buffers and then copied
   to cuSPARSE CSR storage; on a values-only update just the values array is
   refreshed. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;     /* pinned host CSR row offsets / column indices for L */
  PetscScalar                       *AALo;            /* pinned host values for L */
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal:
         strict-lower entries of rows 1..n-1 (ai[n]-ai[1]) plus n unit-diagonal entries */
      nzLower=n+ai[n]-ai[1];
      cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!loTriFactor) {

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix; row 0 holds only its unit diagonal */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          /* copy the strict-lower part of row i, then append the unit diagonal */
          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix; assign() copies the pinned host arrays to the GPU
           (transfer accounted via PetscLogCpuToGpu below) */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* query and allocate the scratch buffer required by the csrsv2 analysis/solve */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                                 );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        /* Fill the lower triangular matrix; sparsity is unchanged so only values are rebuilt */
        AALo[0] = 1.0;
        v       = aa;
        vi      = aj;   /* set for symmetry with the build path; not advanced here */
        offset  = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);
          offset      += nz;
          AALo[offset] = 1.0;
          offset      += 1;
          v  += nz;
        }
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Build (or refresh) the upper triangular factor U on the GPU from the host
   ILU factorization stored in A. Rows are assembled back-to-front from
   a->diag, with the stored diagonal inverted (CUSPARSE_DIAG_TYPE_NON_UNIT). */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;   /* pinned host CSR arrays for U */
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix.
*/
      /* adiag[] is stored in reverse, so adiag[0]-adiag[n] counts all upper-triangular entries */
      nzUpper = adiag[0]-adiag[n];
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix, walking rows from last to first */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;
          vi = aj + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements (the host factor stores the
             reciprocal; v[nz] is the diagonal slot for row i) */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1./v[nz];
          AiUp[i]      = AiUp[i+1] - (nz+1);

          ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix; assign() copies the pinned host arrays to the GPU */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = n;
        upTriFactor->csrMat->num_cols = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* query and allocate the scratch buffer for the csrsv2 analysis/solve */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                                 );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      } else {
        /* Fill the upper triangular matrix; values-only update, same pattern */
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v = aa + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          AAUp[offset] = 1./v[nz];
          ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
        }
        upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
        ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Build both ILU triangular factors on the GPU, allocate the intermediate
   work vector, and cache the row/column permutations (only when they are not
   the identity) as GPU index arrays for use by the permuted solves. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* scratch vector holding the intermediate result between the L and U solves */
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cusparseTriFactors->nnz=a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
    ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* upper triangular indices */
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
    ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Build both ICC triangular factors on the GPU from the host factorization.
   NOTE(review): 'a' and 'b' alias the same A->data, reinterpreted as SeqAIJ
   and SeqSBAIJ respectively — the SBAIJ view supplies the factor's i/j/a
   arrays; confirm against the Cholesky numeric path. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr =
cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      /* first call: build the CSR structure, descriptors and analysis for both factors */
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements: v[nz] is the diagonal entry of row i */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            /* off-diagonals are negated for U; the L copy is additionally scaled by 1/diag */
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* CUDA >= 9 csrsv2 needs an explicit, queried-size work buffer */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                                 );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        /* note: UPPER fill mode is intentional -- the "lower" factor is the same upper CSR
           pattern solved with a TRANSPOSE operation (see solveOp below) */
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                                 );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else {
        /* subsequent calls: structure already on the GPU, only refresh the values */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Copies the ICC triangular factors of A to the GPU (building them on first use), allocates the
   work vector used by the triangular solves, and caches the row permutation and its inverse on
   the GPU when the ordering is not the identity. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip                  = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n                   = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  /* nnz of the symmetric factor pair: off-diagonals counted twice plus the diagonal once */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip,*rip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Numeric LU factorization: delegates the factorization itself to the CPU AIJ kernel, then
   selects the appropriate GPU MatSolve variant and pushes the triangular factors to the GPU. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* factor on the CPU; the result will be copied to the GPU below */
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    /* no permutations needed: use the cheaper natural-ordering solves */
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Numeric Cholesky factorization: CPU factorization followed by GPU solve selection and
   factor upload, mirroring MatLUFactorNumeric_SeqAIJCUSPARSE above. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Builds explicit transposes (CSC form) of both triangular factors and runs the csrsv solve
   analysis on them, so that MatSolveTranspose can use NON_TRANSPOSE solves on the transposed
   structures. */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor; transposing flips the fill mode */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* CUDA >= 11: csr2csc requires an explicit, queried-size work buffer */
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                       loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                       loTriFactor->csrMat->values->data().get(),
                                       loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(),
                                       loTriFactorT->csrMat->values->data().get(),
                                       loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
#else
                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
#endif
                          );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* BUGFIX: was PetscLogEventBegin() here, which left the MAT_CUSPARSEGenerateTranspose
     event (begun above) unbalanced in the PETSc performance log */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat =
cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  /* CUDA >= 9 csrsv2 needs an explicit, queried-size work buffer */
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                           loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
#endif
                           );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor; transposing flips the fill mode */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
               CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e.
the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* CUDA >= 11: csr2csc requires an explicit, queried-size work buffer */
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                       upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                       upTriFactor->csrMat->values->data().get(),
                                       upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(),
                                       upTriFactorT->csrMat->values->data().get(),
                                       upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
#else
                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
#endif
                          );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* BUGFIX: was PetscLogEventBegin() here, which left the MAT_CUSPARSEGenerateTranspose
     event (begun above) unbalanced in the PETSc performance log */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat =
cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                           upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
#endif
                           );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}

/* Lazily builds the explicit transpose of the cuSPARSE matrix used by MatMultTranspose.
   No-op unless transpose generation is enabled (cusparsestruct->transgen) and the transpose
   has not already been built. */
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT     = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (!cusparsestruct->transgen || cusparsestruct->matTranspose) PetscFunctionReturn(0);
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* create cusparse matrix */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

  /* set alpha and beta: device-resident scalars because the handle uses device pointer mode */
  err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    /* transpose has swapped dimensions, same number of nonzeros */
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

    /* compute the transpose, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* CUDA >= 11: csr2csc requires an explicit, queried-size work buffer */
    stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                         A->cmap->n, matrix->num_entries,
                                         matrix->values->data().get(),
                                         cusparsestruct->rowoffsets_gpu->data().get(),
                                         matrix->column_indices->data().get(),
                                         matrixT->values->data().get(),
                                         matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                         CUSPARSE_ACTION_NUMERIC,indexBase,
                                         cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat);
    err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err);
#endif

    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            cusparsestruct->rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                            CUSPARSE_ACTION_NUMERIC,indexBase,
                            cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
#else
                            matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase
#endif
                            );CHKERRCUSPARSE(stat);
    matstructT->mat = matrixT;

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* generic SpMV/SpMM descriptor wrapping the CSC-as-CSR storage */
    stat = cusparseCreateCsr(&matstructT->matDescr,
                             matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                             matrixT->values->data().get(),
                             CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                             indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
  } else if
(cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
    /* no direct HYB transpose: round-trip through temporary CSR/CSC matrices */
    CsrMatrix *temp  = new CsrMatrix;
    CsrMatrix *tempT = new CsrMatrix;
    /* First convert HYB to CSR */
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSPARSE(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
                                       CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSPARSE(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
#endif
  }
  err  = WaitForCUDA();CHKERRCUDA(err);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}

/* Why do we need to analyze the tranposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE?
*/
/* Solves A^T x = b on the GPU using the explicitly transposed triangular factors
   (built lazily on first use), applying the cached row/column permutations. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU            = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get()
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution.
*/ 1420 thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU); 1421 1422 /* restore */ 1423 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1424 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1425 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1426 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1427 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1428 PetscFunctionReturn(0); 1429 } 1430 1431 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1432 { 1433 const PetscScalar *barray; 1434 PetscScalar *xarray; 1435 cusparseStatus_t stat; 1436 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1437 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1438 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1439 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1440 PetscErrorCode ierr; 1441 cudaError_t cerr; 1442 1443 PetscFunctionBegin; 1444 /* Analyze the matrix and create the transpose ... 
on the fly */ 1445 if (!loTriFactorT && !upTriFactorT) { 1446 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1447 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1448 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1449 } 1450 1451 /* Get the GPU pointers */ 1452 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1453 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1454 1455 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1456 /* First, solve U */ 1457 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1458 upTriFactorT->csrMat->num_rows, 1459 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1460 upTriFactorT->csrMat->num_entries, 1461 #endif 1462 &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1463 upTriFactorT->csrMat->values->data().get(), 1464 upTriFactorT->csrMat->row_offsets->data().get(), 1465 upTriFactorT->csrMat->column_indices->data().get(), 1466 upTriFactorT->solveInfo, 1467 barray, tempGPU->data().get() 1468 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1469 ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1470 #endif 1471 );CHKERRCUSPARSE(stat); 1472 1473 /* Then, solve L */ 1474 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1475 loTriFactorT->csrMat->num_rows, 1476 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1477 loTriFactorT->csrMat->num_entries, 1478 #endif 1479 &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1480 loTriFactorT->csrMat->values->data().get(), 1481 loTriFactorT->csrMat->row_offsets->data().get(), 1482 loTriFactorT->csrMat->column_indices->data().get(), 1483 loTriFactorT->solveInfo, 1484 tempGPU->data().get(), xarray 1485 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1486 ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1487 #endif 1488 );CHKERRCUSPARSE(stat); 1489 1490 /* restore */ 1491 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1492 ierr = 
VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1493 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1494 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1495 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1496 PetscFunctionReturn(0); 1497 } 1498 1499 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1500 { 1501 const PetscScalar *barray; 1502 PetscScalar *xarray; 1503 thrust::device_ptr<const PetscScalar> bGPU; 1504 thrust::device_ptr<PetscScalar> xGPU; 1505 cusparseStatus_t stat; 1506 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1507 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1508 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1509 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1510 PetscErrorCode ierr; 1511 cudaError_t cerr; 1512 1513 PetscFunctionBegin; 1514 1515 /* Get the GPU pointers */ 1516 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1517 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1518 xGPU = thrust::device_pointer_cast(xarray); 1519 bGPU = thrust::device_pointer_cast(barray); 1520 1521 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1522 /* First, reorder with the row permutation */ 1523 thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1524 thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 1525 tempGPU->begin()); 1526 1527 /* Next, solve L */ 1528 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1529 loTriFactor->csrMat->num_rows, 1530 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1531 loTriFactor->csrMat->num_entries, 1532 #endif 1533 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1534 loTriFactor->csrMat->values->data().get(), 1535 loTriFactor->csrMat->row_offsets->data().get(), 1536 
loTriFactor->csrMat->column_indices->data().get(), 1537 loTriFactor->solveInfo, 1538 tempGPU->data().get(), xarray 1539 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1540 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 1541 #endif 1542 );CHKERRCUSPARSE(stat); 1543 1544 /* Then, solve U */ 1545 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1546 upTriFactor->csrMat->num_rows, 1547 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1548 upTriFactor->csrMat->num_entries, 1549 #endif 1550 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1551 upTriFactor->csrMat->values->data().get(), 1552 upTriFactor->csrMat->row_offsets->data().get(), 1553 upTriFactor->csrMat->column_indices->data().get(), 1554 upTriFactor->solveInfo, 1555 xarray, tempGPU->data().get() 1556 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1557 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 1558 #endif 1559 );CHKERRCUSPARSE(stat); 1560 1561 /* Last, reorder with the column permutation */ 1562 thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 1563 thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), 1564 xGPU); 1565 1566 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1567 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1568 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1569 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1570 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1571 PetscFunctionReturn(0); 1572 } 1573 1574 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1575 { 1576 const PetscScalar *barray; 1577 PetscScalar *xarray; 1578 cusparseStatus_t stat; 1579 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1580 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1581 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = 
(Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1582 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1583 PetscErrorCode ierr; 1584 cudaError_t cerr; 1585 1586 PetscFunctionBegin; 1587 /* Get the GPU pointers */ 1588 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1589 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1590 1591 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1592 /* First, solve L */ 1593 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1594 loTriFactor->csrMat->num_rows, 1595 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1596 loTriFactor->csrMat->num_entries, 1597 #endif 1598 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1599 loTriFactor->csrMat->values->data().get(), 1600 loTriFactor->csrMat->row_offsets->data().get(), 1601 loTriFactor->csrMat->column_indices->data().get(), 1602 loTriFactor->solveInfo, 1603 barray, tempGPU->data().get() 1604 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1605 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 1606 #endif 1607 );CHKERRCUSPARSE(stat); 1608 1609 /* Next, solve U */ 1610 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1611 upTriFactor->csrMat->num_rows, 1612 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1613 upTriFactor->csrMat->num_entries, 1614 #endif 1615 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1616 upTriFactor->csrMat->values->data().get(), 1617 upTriFactor->csrMat->row_offsets->data().get(), 1618 upTriFactor->csrMat->column_indices->data().get(), 1619 upTriFactor->solveInfo, 1620 tempGPU->data().get(), xarray 1621 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1622 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 1623 #endif 1624 );CHKERRCUSPARSE(stat); 1625 1626 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1627 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1628 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1629 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1630 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - 
A->cmap->n);CHKERRQ(ierr); 1631 PetscFunctionReturn(0); 1632 } 1633 1634 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) 1635 { 1636 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1637 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1638 cudaError_t cerr; 1639 PetscErrorCode ierr; 1640 1641 PetscFunctionBegin; 1642 if (A->offloadmask == PETSC_OFFLOAD_GPU) { 1643 CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat; 1644 1645 ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 1646 cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 1647 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1648 ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr); 1649 ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 1650 A->offloadmask = PETSC_OFFLOAD_BOTH; 1651 } 1652 PetscFunctionReturn(0); 1653 } 1654 1655 static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 1656 { 1657 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1658 PetscErrorCode ierr; 1659 1660 PetscFunctionBegin; 1661 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 1662 *array = a->a; 1663 A->offloadmask = PETSC_OFFLOAD_CPU; 1664 PetscFunctionReturn(0); 1665 } 1666 1667 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 1668 { 1669 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1670 Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 1671 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1672 PetscInt m = A->rmap->n,*ii,*ridx,tmp; 1673 PetscErrorCode ierr; 1674 cusparseStatus_t stat; 1675 cudaError_t err; 1676 1677 PetscFunctionBegin; 1678 if (A->boundtocpu) PetscFunctionReturn(0); 1679 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 1680 if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { 1681 /* Copy values only */ 1682 CsrMatrix 
*matrix,*matrixT; 1683 matrix = (CsrMatrix*)cusparsestruct->mat->mat; 1684 1685 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1686 matrix->values->assign(a->a, a->a+a->nz); 1687 err = WaitForCUDA();CHKERRCUDA(err); 1688 ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 1689 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1690 1691 /* Update matT when it was built before */ 1692 if (cusparsestruct->matTranspose) { 1693 cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr); 1694 matrixT = (CsrMatrix*)cusparsestruct->matTranspose->mat; 1695 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1696 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1697 A->cmap->n, matrix->num_entries, 1698 matrix->values->data().get(), 1699 cusparsestruct->rowoffsets_gpu->data().get(), 1700 matrix->column_indices->data().get(), 1701 matrixT->values->data().get(), 1702 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1703 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1704 CUSPARSE_ACTION_NUMERIC,indexBase, 1705 cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer 1706 #else 1707 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1708 CUSPARSE_ACTION_NUMERIC, indexBase 1709 #endif 1710 );CHKERRCUSPARSE(stat); 1711 err = WaitForCUDA();CHKERRCUDA(err); 1712 ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1713 } 1714 } else { 1715 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1716 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr); 1717 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr); 1718 delete cusparsestruct->workVector; 1719 delete cusparsestruct->rowoffsets_gpu; 1720 try { 1721 if (a->compressedrow.use) { 1722 m = 
a->compressedrow.nrows; 1723 ii = a->compressedrow.i; 1724 ridx = a->compressedrow.rindex; 1725 } else { 1726 m = A->rmap->n; 1727 ii = a->i; 1728 ridx = NULL; 1729 } 1730 cusparsestruct->nrows = m; 1731 1732 /* create cusparse matrix */ 1733 matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 1734 stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat); 1735 stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 1736 stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1737 1738 err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 1739 err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1740 err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1741 err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1742 err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1743 err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1744 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 1745 1746 /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1747 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1748 /* set the matrix */ 1749 CsrMatrix *mat= new CsrMatrix; 1750 mat->num_rows = m; 1751 mat->num_cols = A->cmap->n; 1752 mat->num_entries = a->nz; 1753 mat->row_offsets = new THRUSTINTARRAY32(m+1); 1754 mat->row_offsets->assign(ii, ii + m+1); 1755 1756 mat->column_indices = new THRUSTINTARRAY32(a->nz); 1757 mat->column_indices->assign(a->j, a->j+a->nz); 1758 1759 mat->values = new THRUSTARRAY(a->nz); 1760 mat->values->assign(a->a, a->a+a->nz); 1761 1762 /* assign the pointer */ 1763 matstruct->mat = mat; 1764 #if 
PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1765 if (mat->num_rows) { /* cusparse errors on empty matrices! */ 1766 stat = cusparseCreateCsr(&matstruct->matDescr, 1767 mat->num_rows, mat->num_cols, mat->num_entries, 1768 mat->row_offsets->data().get(), mat->column_indices->data().get(), 1769 mat->values->data().get(), 1770 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 1771 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 1772 } 1773 #endif 1774 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1775 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1776 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1777 #else 1778 CsrMatrix *mat= new CsrMatrix; 1779 mat->num_rows = m; 1780 mat->num_cols = A->cmap->n; 1781 mat->num_entries = a->nz; 1782 mat->row_offsets = new THRUSTINTARRAY32(m+1); 1783 mat->row_offsets->assign(ii, ii + m+1); 1784 1785 mat->column_indices = new THRUSTINTARRAY32(a->nz); 1786 mat->column_indices->assign(a->j, a->j+a->nz); 1787 1788 mat->values = new THRUSTARRAY(a->nz); 1789 mat->values->assign(a->a, a->a+a->nz); 1790 1791 cusparseHybMat_t hybMat; 1792 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1793 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 
1794 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1795 stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, 1796 matstruct->descr, mat->values->data().get(), 1797 mat->row_offsets->data().get(), 1798 mat->column_indices->data().get(), 1799 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1800 /* assign the pointer */ 1801 matstruct->mat = hybMat; 1802 1803 if (mat) { 1804 if (mat->values) delete (THRUSTARRAY*)mat->values; 1805 if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices; 1806 if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets; 1807 delete (CsrMatrix*)mat; 1808 } 1809 #endif 1810 } 1811 1812 /* assign the compressed row indices */ 1813 if (a->compressedrow.use) { 1814 cusparsestruct->workVector = new THRUSTARRAY(m); 1815 matstruct->cprowIndices = new THRUSTINTARRAY(m); 1816 matstruct->cprowIndices->assign(ridx,ridx+m); 1817 tmp = m; 1818 } else { 1819 cusparsestruct->workVector = NULL; 1820 matstruct->cprowIndices = NULL; 1821 tmp = 0; 1822 } 1823 ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr); 1824 1825 /* assign the pointer */ 1826 cusparsestruct->mat = matstruct; 1827 } catch(char *ex) { 1828 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1829 } 1830 err = WaitForCUDA();CHKERRCUDA(err); 1831 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1832 cusparsestruct->nonzerostate = A->nonzerostate; 1833 } 1834 A->offloadmask = PETSC_OFFLOAD_BOTH; 1835 } 1836 PetscFunctionReturn(0); 1837 } 1838 1839 struct VecCUDAPlusEquals 1840 { 1841 template <typename Tuple> 1842 __host__ __device__ 1843 void operator()(Tuple t) 1844 { 1845 thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 1846 } 1847 }; 1848 1849 struct VecCUDAEquals 1850 { 1851 template <typename Tuple> 1852 __host__ __device__ 1853 void operator()(Tuple t) 1854 { 1855 thrust::get<1>(t) = thrust::get<0>(t); 1856 } 1857 }; 
1858 1859 struct VecCUDAEqualsReverse 1860 { 1861 template <typename Tuple> 1862 __host__ __device__ 1863 void operator()(Tuple t) 1864 { 1865 thrust::get<0>(t) = thrust::get<1>(t); 1866 } 1867 }; 1868 1869 struct MatMatCusparse { 1870 PetscBool cisdense; 1871 PetscScalar *Bt; 1872 Mat X; 1873 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1874 PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 1875 cusparseDnMatDescr_t matBDescr; 1876 cusparseDnMatDescr_t matCDescr; 1877 size_t spmmBufferSize; 1878 void *spmmBuffer; 1879 PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/ 1880 #endif 1881 }; 1882 1883 static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 1884 { 1885 PetscErrorCode ierr; 1886 MatMatCusparse *mmdata = (MatMatCusparse *)data; 1887 cudaError_t cerr; 1888 1889 PetscFunctionBegin; 1890 cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr); 1891 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1892 cusparseStatus_t stat; 1893 if (mmdata->matBDescr) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat);} 1894 if (mmdata->matCDescr) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat);} 1895 if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);} 1896 #endif 1897 ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr); 1898 ierr = PetscFree(data);CHKERRQ(ierr); 1899 PetscFunctionReturn(0); 1900 } 1901 1902 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool); 1903 1904 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 1905 { 1906 Mat_Product *product = C->product; 1907 Mat A,B; 1908 PetscInt m,n,blda,clda; 1909 PetscBool flg,biscuda; 1910 Mat_SeqAIJCUSPARSE *cusp; 1911 cusparseStatus_t stat; 1912 cusparseOperation_t opA; 1913 const PetscScalar *barray; 1914 PetscScalar *carray; 1915 PetscErrorCode ierr; 1916 MatMatCusparse *mmdata; 1917 Mat_SeqAIJCUSPARSEMultStruct *mat; 1918 CsrMatrix *csrmat; 1919 cudaError_t 
cerr; 1920 1921 PetscFunctionBegin; 1922 MatCheckProduct(C,1); 1923 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 1924 mmdata = (MatMatCusparse*)product->data; 1925 A = product->A; 1926 B = product->B; 1927 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 1928 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 1929 /* currently CopyToGpu does not copy if the matrix is bound to CPU 1930 Instead of silently accepting the wrong answer, I prefer to raise the error */ 1931 if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 1932 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1933 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1934 switch (product->type) { 1935 case MATPRODUCT_AB: 1936 case MATPRODUCT_PtAP: 1937 mat = cusp->mat; 1938 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1939 m = A->rmap->n; 1940 n = B->cmap->n; 1941 break; 1942 case MATPRODUCT_AtB: 1943 if (!cusp->transgen) { 1944 mat = cusp->mat; 1945 opA = CUSPARSE_OPERATION_TRANSPOSE; 1946 } else { 1947 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1948 mat = cusp->matTranspose; 1949 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1950 } 1951 m = A->cmap->n; 1952 n = B->cmap->n; 1953 break; 1954 case MATPRODUCT_ABt: 1955 case MATPRODUCT_RARt: 1956 mat = cusp->mat; 1957 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1958 m = A->rmap->n; 1959 n = B->rmap->n; 1960 break; 1961 default: 1962 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 1963 } 1964 if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 1965 csrmat = (CsrMatrix*)mat->mat; 1966 /* if the user passed a CPU matrix, copy the data to the GPU */ 1967 ierr = 
PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr); 1968 if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);} 1969 ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr); 1970 1971 ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr); 1972 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 1973 ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 1974 ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr); 1975 } else { 1976 ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr); 1977 ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr); 1978 } 1979 1980 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1981 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1982 cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 1983 /* (re)allcoate spmmBuffer if not initialized or LDAs are different */ 1984 if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 1985 if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;} 1986 if (!mmdata->matBDescr) { 1987 stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 1988 mmdata->Blda = blda; 1989 } 1990 1991 if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;} 1992 if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 1993 stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 1994 mmdata->Clda = clda; 1995 } 1996 1997 if (!mat->matDescr) { 1998 stat = cusparseCreateCsr(&mat->matDescr, 1999 csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, 2000 
csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), 2001 csrmat->values->data().get(), 2002 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2003 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 2004 } 2005 stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one, 2006 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2007 mmdata->matCDescr,cusparse_scalartype, 2008 cusp->spmmAlg,&mmdata->spmmBufferSize);CHKERRCUSPARSE(stat); 2009 if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);} 2010 cerr = cudaMalloc(&mmdata->spmmBuffer,mmdata->spmmBufferSize);CHKERRCUDA(cerr); 2011 mmdata->initialized = PETSC_TRUE; 2012 } else { 2013 /* to be safe, always update pointers of the mats */ 2014 stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat); 2015 stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat); 2016 stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat); 2017 } 2018 2019 /* do cusparseSpMM, which supports transpose on B */ 2020 stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one, 2021 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2022 mmdata->matCDescr,cusparse_scalartype, 2023 cusp->spmmAlg,mmdata->spmmBuffer);CHKERRCUSPARSE(stat); 2024 #else 2025 PetscInt k; 2026 /* cusparseXcsrmm does not support transpose on B */ 2027 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2028 cublasHandle_t cublasv2handle; 2029 cublasStatus_t cerr; 2030 2031 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 2032 cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T, 2033 B->cmap->n,B->rmap->n, 2034 &PETSC_CUSPARSE_ONE ,barray,blda, 2035 &PETSC_CUSPARSE_ZERO,barray,blda, 2036 mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr); 2037 blda = B->cmap->n; 2038 k = B->cmap->n; 2039 } else { 2040 k = B->rmap->n; 2041 } 2042 2043 /* perform 
the MatMat operation, op(A) is m x k, op(B) is k x n */ 2044 stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k, 2045 csrmat->num_entries,mat->alpha_one,mat->descr, 2046 csrmat->values->data().get(), 2047 csrmat->row_offsets->data().get(), 2048 csrmat->column_indices->data().get(), 2049 mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero, 2050 carray,clda);CHKERRCUSPARSE(stat); 2051 #endif 2052 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2053 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2054 ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr); 2055 ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr); 2056 if (product->type == MATPRODUCT_RARt) { 2057 ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2058 ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2059 } else if (product->type == MATPRODUCT_PtAP) { 2060 ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2061 ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2062 } else { 2063 ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr); 2064 } 2065 if (mmdata->cisdense) { 2066 ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr); 2067 } 2068 if (!biscuda) { 2069 ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 2070 } 2071 PetscFunctionReturn(0); 2072 } 2073 2074 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2075 { 2076 Mat_Product *product = C->product; 2077 Mat A,B; 2078 PetscInt m,n; 2079 PetscBool cisdense,flg; 2080 PetscErrorCode ierr; 2081 MatMatCusparse *mmdata; 2082 Mat_SeqAIJCUSPARSE *cusp; 2083 2084 PetscFunctionBegin; 2085 MatCheckProduct(C,1); 2086 if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty"); 2087 A = product->A; 2088 B = product->B; 2089 ierr = 
PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2090 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 2091 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2092 if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2093 switch (product->type) { 2094 case MATPRODUCT_AB: 2095 m = A->rmap->n; 2096 n = B->cmap->n; 2097 break; 2098 case MATPRODUCT_AtB: 2099 m = A->cmap->n; 2100 n = B->cmap->n; 2101 break; 2102 case MATPRODUCT_ABt: 2103 m = A->rmap->n; 2104 n = B->rmap->n; 2105 break; 2106 case MATPRODUCT_PtAP: 2107 m = B->cmap->n; 2108 n = B->cmap->n; 2109 break; 2110 case MATPRODUCT_RARt: 2111 m = B->rmap->n; 2112 n = B->rmap->n; 2113 break; 2114 default: 2115 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 2116 } 2117 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2118 /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */ 2119 ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr); 2120 ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr); 2121 2122 /* product data */ 2123 ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2124 mmdata->cisdense = cisdense; 2125 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0) 2126 /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */ 2127 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2128 cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr); 2129 } 2130 #endif 2131 /* for these products we need intermediate storage */ 2132 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2133 ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr); 2134 ierr = 
MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr); 2135 if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */ 2136 ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr); 2137 } else { 2138 ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr); 2139 } 2140 } 2141 C->product->data = mmdata; 2142 C->product->destroy = MatDestroy_MatMatCusparse; 2143 2144 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 2145 PetscFunctionReturn(0); 2146 } 2147 2148 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 2149 2150 /* handles dense B */ 2151 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C) 2152 { 2153 Mat_Product *product = C->product; 2154 PetscErrorCode ierr; 2155 2156 PetscFunctionBegin; 2157 MatCheckProduct(C,1); 2158 if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A"); 2159 if (product->A->boundtocpu) { 2160 ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(C);CHKERRQ(ierr); 2161 PetscFunctionReturn(0); 2162 } 2163 switch (product->type) { 2164 case MATPRODUCT_AB: 2165 case MATPRODUCT_AtB: 2166 case MATPRODUCT_ABt: 2167 case MATPRODUCT_PtAP: 2168 case MATPRODUCT_RARt: 2169 C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 2170 default: 2171 break; 2172 } 2173 PetscFunctionReturn(0); 2174 } 2175 2176 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2177 { 2178 PetscErrorCode ierr; 2179 2180 PetscFunctionBegin; 2181 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2182 PetscFunctionReturn(0); 2183 } 2184 2185 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz) 2186 { 2187 PetscErrorCode ierr; 2188 2189 PetscFunctionBegin; 2190 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2191 
  PetscFunctionReturn(0);
}

/* y = A^H x, delegating to the shared SpMV kernel (no add, Hermitian transpose) */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* z = A^H x + y, delegating to the shared SpMV kernel (add, Hermitian transpose) */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* y = A^T x, delegating to the shared SpMV kernel (no add, transpose) */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op
   yy may be NULL (plain multiply) or alias zz (in-place add); both cases are handled below. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny;
#endif

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) {
    /* empty matrix: op(A) x is zero, so the result is just y (or zero) */
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !cusparsestruct->transgen) {
      /* no explicit transpose stored: let cuSPARSE transpose on the fly */
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      /* use (and lazily build) the explicitly stored transpose */
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
         allocated to accommodate different uses. So we get the length info directly from mat.
      */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                       matstruct->matDescr,
                                       matstruct->cuSpMV[opA].vecXDescr, beta,
                                       matstruct->cuSpMV[opA].vecYDescr,
                                       cusparse_scalartype,
                                       cusparsestruct->spmvAlg,
                                       &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                          matstruct->alpha_one,
                          matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */
                          matstruct->cuSpMV[opA].vecXDescr,
                          beta,
                          matstruct->cuSpMV[opA].vecYDescr,
                          cusparse_scalartype,
                          cusparsestruct->spmvAlg,
                          matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
#else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
#endif
      }
    }
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* z = A^T x + y, delegating to the shared SpMV kernel (add, transpose) */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode              ierr;
  PetscSplitCSRDataStructure *d_mat = NULL;
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
  }
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it?
  if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0);
  if (d_mat) {
    /* a device-side split-CSR matrix exists, so the up-to-date data is on the GPU */
    A->offloadmask = PETSC_OFFLOAD_GPU;
  }

  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz).  By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.
nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Release all GPU-side resources held by the matrix, then dispatch to the base SeqAIJ destructor */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode              ierr;
  PetscSplitCSRDataStructure *d_mat = NULL;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    /* keep the device matrix pointer so its CUDA allocations can be freed after spptr is gone */
    d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
    ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL;
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  if (d_mat) {
    Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
    cudaError_t                err;
    PetscSplitCSRDataStructure h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* copy the device struct to the host to retrieve the pointers it owns */
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    if (a->compressedrow.use) {
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate as plain SeqAIJ, then convert the copy in place to SEQAIJCUSPARSE */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str) // put axpy in aijcusparse, etc.
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  PetscBool      flgx,flgy;

  PetscFunctionBegin;
  if (a == (PetscScalar)0.0) PetscFunctionReturn(0);
  PetscValidHeaderSpecific(Y,MAT_CLASSID,1);
  PetscValidHeaderSpecific(X,MAT_CLASSID,3);
  /* fall back to the CPU implementation unless both operands are SEQAIJCUSPARSE */
  ierr = PetscObjectTypeCompare((PetscObject)Y,MATSEQAIJCUSPARSE,&flgy);CHKERRQ(ierr);
  ierr = PetscObjectTypeCompare((PetscObject)X,MATSEQAIJCUSPARSE,&flgx);CHKERRQ(ierr);
  if (!flgx || !flgy) {
    ierr = MatAXPY_SeqAIJ( Y, a, X, str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  if (Y->factortype != MAT_FACTOR_NONE || X->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"both matrices must be MAT_FACTOR_NONE");
  if (str == DIFFERENT_NONZERO_PATTERN) {
    /* cheap structural comparison: if i and j arrays match, upgrade to SAME_NONZERO_PATTERN so the fast path applies */
    if (x->nz == y->nz) {
      PetscBool e;
      ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr);
      if (e) {
        ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr);
        if (e) {
          str = SAME_NONZERO_PATTERN;
        }
      }
    }
  }
  if (str != SAME_NONZERO_PATTERN) {
    /* different patterns require the CPU merge implementation */
    ierr = MatAXPY_SeqAIJ( Y, a, X, str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  } else {
    Mat_SeqAIJCUSPARSE *cusparsestruct_y = (Mat_SeqAIJCUSPARSE*)Y->spptr;
    Mat_SeqAIJCUSPARSE *cusparsestruct_x = (Mat_SeqAIJCUSPARSE*)X->spptr;
    if (cusparsestruct_y->format!=MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
    if (cusparsestruct_x->format!=MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
    if (!cusparsestruct_y->mat || !cusparsestruct_x->mat) {
      /* GPU copies not built yet: do the axpy on the CPU, then push both matrices up */
      if (Y->offloadmask == PETSC_OFFLOAD_UNALLOCATED || Y->offloadmask == PETSC_OFFLOAD_GPU) {
        ierr = MatSeqAIJCUSPARSECopyFromGPU(Y);CHKERRQ(ierr);
      }
      if (X->offloadmask == PETSC_OFFLOAD_UNALLOCATED || X->offloadmask == PETSC_OFFLOAD_GPU) {
        ierr = MatSeqAIJCUSPARSECopyFromGPU(X);CHKERRQ(ierr);
      }
      ierr = MatAXPY_SeqAIJ(Y,a,X,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
    } else {
      /* identical nonzero patterns: the values arrays line up, so one cuBLAS axpy does the job */
      cublasHandle_t cublasv2handle;
      cublasStatus_t cberr;
      cudaError_t    err;
      PetscScalar    alpha = a;
      PetscBLASInt   one = 1, bnz = 1;
      CsrMatrix      *matrix_y = (CsrMatrix*)cusparsestruct_y->mat->mat;
      CsrMatrix      *matrix_x = (CsrMatrix*)cusparsestruct_x->mat->mat;
      PetscScalar    *aa_y, *aa_x;
      aa_y = matrix_y->values->data().get();
      aa_x = matrix_x->values->data().get();
      ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
      ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      cberr = cublasXaxpy(cublasv2handle,bnz,&alpha,aa_x,one,aa_y,one);CHKERRCUBLAS(cberr);
      err = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
      ierr = PetscObjectStateIncrease((PetscObject)Y);CHKERRQ(ierr);
      /* only the GPU copy of Y was updated */
      if (Y->offloadmask == PETSC_OFFLOAD_BOTH) Y->offloadmask = PETSC_OFFLOAD_GPU;
      else if (Y->offloadmask != PETSC_OFFLOAD_GPU) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"wrong state");
      ierr = MatSeqAIJCUSPARSECopyFromGPU(Y);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}

/* Zero all stored values on both GPU (CSR copies, incl. the transpose) and CPU */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  PetscBool      both = PETSC_FALSE;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (spptr->mat) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE;
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
    if (spptr->matTranspose) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
      if (matrix->values) {
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
  }
  //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
  /* zero the host copy directly instead of calling MatZeroEntries_SeqAIJ */
  ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;

  PetscFunctionReturn(0);
}

/* Switch the matrix operation table between the CPU (flg=PETSC_TRUE) and GPU implementations */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  if (flg) {
    /* bring the data back to the host before routing everything to the SeqAIJ kernels */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
  } else {
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  /* inodes are only used by the CPU kernels */
  a->inode.use = flg;
  PetscFunctionReturn(0);
}

/* Convert a SeqAIJ matrix to SEQAIJCUSPARSE: attach the cuSPARSE data structures and swap in the GPU ops */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat;

  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;

      ierr =
PetscNew(&spptr);CHKERRQ(ierr);
      spptr->format = MAT_CUSPARSE_CSR;
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
      spptr->deviceMat = NULL;
    } else {
      /* factored matrices carry triangular-factor data instead of the plain CSR struct */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* install the GPU operation table and the composed functions */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  /* PetscOptionsObject is introduced by the PetscObjectOptionsBegin() macro */
  ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr);
  ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

   Level: beginner

.seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);

/* Register the cuSPARSE solver type for the LU, Cholesky, ILU and ICC factorizations of SEQAIJCUSPARSE matrices */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Free all members of the Mat_SeqAIJCUSPARSE struct and the struct itself */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr =
MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->cooPerm_v;
    delete (*cusparsestruct)->cooPerm_w;
    if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cudaError_t cerr = cudaFree((*cusparsestruct)->csr2cscBuffer);CHKERRCUDA(cerr);
#endif
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Free the three thrust arrays of a CsrMatrix and the CsrMatrix itself; *mat is set to 0 */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    delete (*mat)->values;
    delete (*mat)->column_indices;
    delete (*mat)->row_offsets;
    delete *mat;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}

/* Free a triangular-factor struct: matrix descriptor, analysis info, CSR data and scratch buffers */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    if ((*trifactor)->solveBuffer) {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
#endif
    ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Free a mult struct: the stored matrix (CSR or HYB), descriptors, device-side scalar constants,
   and (CUDA >= 11) the cached SpMV descriptors/buffers for all three operation modes */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
#endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        CsrMatrix_Destroy(&mat);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    for (int i=0; i<3; i++) { /* one cached SpMV setup per cusparseOperation_t (N, T, H) */
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}

/* Release the factor data held by a TriFactors struct but keep the struct (and its handle) alive */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector   = NULL;
  }
  PetscFunctionReturn(0);
}

/* Fully destroy a TriFactors struct: reset its contents, destroy the cuSPARSE handle, free the struct */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    /* extra parentheses make the intentional assignment-in-condition explicit (avoids -Wparentheses) */
    if ((handle = (*trifactors)->handle)) {
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Lexicographic (row, col) ordering for COO index pairs */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    if (t1.get<0>() < t2.get<0>()) return true;
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
    return false;
  }
};

/* Equality of COO index pairs (same row and same column) */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
    return true;
  }
};

/* 0 when equal, 1 when different; used to mark boundaries between runs of duplicate entries */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return t1 ==
t2 ? 0 : 1; 2944 } 2945 }; 2946 2947 struct IJSum 2948 { 2949 __host__ __device__ 2950 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 2951 { 2952 return t1||t2; 2953 } 2954 }; 2955 2956 #include <thrust/iterator/discard_iterator.h> 2957 PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 2958 { 2959 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2960 CsrMatrix *matrix; 2961 PetscErrorCode ierr; 2962 cudaError_t cerr; 2963 PetscInt n; 2964 2965 PetscFunctionBegin; 2966 if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct"); 2967 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix"); 2968 if (!cusp->cooPerm) { 2969 ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2970 ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2971 PetscFunctionReturn(0); 2972 } 2973 n = cusp->cooPerm->size(); 2974 matrix = (CsrMatrix*)cusp->mat->mat; 2975 if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 2976 if (!cusp->cooPerm_v) { cusp->cooPerm_v = new THRUSTARRAY(n); } 2977 ierr = PetscLogEventBegin(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr); 2978 if (v) { 2979 cusp->cooPerm_v->assign(v,v+n); 2980 ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); 2981 } 2982 else thrust::fill(thrust::device,cusp->cooPerm_v->begin(),cusp->cooPerm_v->end(),0.); 2983 if (imode == ADD_VALUES) { 2984 if (cusp->cooPerm_a) { 2985 if (!cusp->cooPerm_w) cusp->cooPerm_w = new THRUSTARRAY(matrix->values->size()); 2986 auto vbit = thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()); 2987 thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cusp->cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 2988 
thrust::transform(cusp->cooPerm_w->begin(),cusp->cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
    } else {
      /* unique entries: add permuted values straight into the CSR value array */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals());
    }
  } else {
    if (cusp->cooPerm_a) { /* non unique values insertion, result is undefined (we cannot guarantee last takes precedence)
                              if we are inserting two different values into the same location */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()),
                                                                thrust::make_permutation_iterator(matrix->values->begin(),cusp->cooPerm_a->begin())));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()),
                                                                thrust::make_permutation_iterator(matrix->values->begin(),cusp->cooPerm_a->end())));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    } else {
      /* unique entries: overwrite the CSR value array with permuted values */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    }
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  /* we can remove this call when MatSeqAIJGetArray operations are used everywhere! */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#include <thrust/binary_search.h>
/* Preallocate the matrix from n COO (coo_i[], coo_j[]) index pairs.
   On the device: sorts the pairs lexicographically while recording the
   permutation in cooPerm, compresses duplicates (recording the COO->CSR
   index map in cooPerm_a when duplicates exist), and builds the CSR row
   offsets with a vectorized upper_bound search. The host-side AIJ arrays
   (a->i, a->j, ilen, imax) are then rebuilt to match. Values are zeroed;
   use MatSetValuesCOO_SeqAIJCUSPARSE() to fill them. */
PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *matrix;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0;
  cudaError_t        cerr;

  PetscFunctionBegin;
  /* fix: guard against a missing spptr, consistent with MatSetValuesCOO_SeqAIJCUSPARSE() */
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  ierr = PetscLogEventBegin(MAT_CUSPARSEPreallCOO,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    /* the nonzero pattern changed size: drop all cached COO state */
    delete cusp->cooPerm;
    delete cusp->cooPerm_v;
    delete cusp->cooPerm_w;
    delete cusp->cooPerm_a;
    cusp->cooPerm   = NULL;
    cusp->cooPerm_v = NULL;
    cusp->cooPerm_w = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    /* sort the (i,j) pairs, carrying the original positions along in cooPerm */
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());
    *cusp->cooPerm_a = d_i;  /* keep sorted rows; d_i/d_j are compressed by unique() below */
    THRUSTINTARRAY w = d_j;

    auto nekey = thrust::unique(fkey, ekey, IJEqual());
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* I couldn't come up with a more elegant algorithm */
      /* mark where either the row or the column changes, then prefix-sum the
         markers to map each sorted COO entry to its compressed CSR slot */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());
      (*cusp->cooPerm_a)[0] = 0;
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
    }
    /* row offsets: for each row r, count compressed entries with i <= r */
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(),
                        search_begin, search_begin + A->rmap->n,
                        ii.begin());

    /* rebuild the host-side AIJ arrays to match the device pattern */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0;
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
    }
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEPreallCOO,A,0,0,0);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy garbage to the GPU */
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  {
    matrix = (CsrMatrix*)cusp->mat->mat;
    if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
    /* zero the values; a cached transpose (if any) is now stale and must go */
    thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
  }

  A->offloadmask   = PETSC_OFFLOAD_CPU;
  A->assembled     = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}