/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>

/* String table for MatCUSPARSEStorageFormat, laid out in the format PetscOptionsEnum() expects */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
  } cusparseCsr2CscAlg_t;
*/
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif

/* Forward declarations of the SeqAIJCUSPARSE implementations defined later in this file */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

/* Attach a caller-owned CUDA stream to the cuSPARSE handle stored in A->spptr;
   subsequent cuSPARSE calls on this matrix are issued on that stream */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

/* Replace the cuSPARSE handle stored in A->spptr with a caller-supplied one.
   Any previously owned handle is destroyed first; the pointer mode of the new
   handle is set to device so scalar arguments may live in GPU memory */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      /* avoid leaking the handle we currently own */
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

/* Drop (without destroying) the cuSPARSE handle reference held by A; used when the
   handle is owned by someone else. A no-op for non-seqaijcusparse matrices */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          flg;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg || !cusparsestruct) PetscFunctionReturn(0);
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

/* Report the solver package name for factored matrices of this type */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.
142 143 Level: beginner 144 145 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 146 M*/ 147 148 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B) 149 { 150 PetscErrorCode ierr; 151 PetscInt n = A->rmap->n; 152 153 PetscFunctionBegin; 154 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 155 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 156 (*B)->factortype = ftype; 157 (*B)->useordering = PETSC_TRUE; 158 ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 159 160 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 161 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 162 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 163 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 164 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 165 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 166 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 167 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 168 169 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 170 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr); 171 PetscFunctionReturn(0); 172 } 173 174 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 175 { 176 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 177 178 PetscFunctionBegin; 179 switch (op) { 180 case MAT_CUSPARSE_MULT: 181 cusparsestruct->format = format; 182 break; 183 case MAT_CUSPARSE_ALL: 184 cusparsestruct->format = format; 185 break; 186 default: 187 
SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op); 188 } 189 PetscFunctionReturn(0); 190 } 191 192 /*@ 193 MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 194 operation. Only the MatMult operation can use different GPU storage formats 195 for MPIAIJCUSPARSE matrices. 196 Not Collective 197 198 Input Parameters: 199 + A - Matrix of type SEQAIJCUSPARSE 200 . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 201 - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2) 202 203 Output Parameter: 204 205 Level: intermediate 206 207 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 208 @*/ 209 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 210 { 211 PetscErrorCode ierr; 212 213 PetscFunctionBegin; 214 PetscValidHeaderSpecific(A, MAT_CLASSID,1); 215 ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr); 216 PetscFunctionReturn(0); 217 } 218 219 /*@ 220 MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the tranpose matrix before calling MatMultTranspose 221 222 Collective on mat 223 224 Input Parameters: 225 + A - Matrix of type SEQAIJCUSPARSE 226 - transgen - the boolean flag 227 228 Level: intermediate 229 230 .seealso: MATSEQAIJCUSPARSE 231 @*/ 232 PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen) 233 { 234 PetscErrorCode ierr; 235 PetscBool flg; 236 237 PetscFunctionBegin; 238 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 239 ierr = 
PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (flg) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
    cusp->transgen = transgen;
    if (!transgen) { /* need to destroy the transpose matrix if present to prevent from logic errors if transgen is set to true later */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}

/* Process runtime options (-mat_cusparse_*) for a SeqAIJCUSPARSE matrix:
   explicit-transpose generation, storage format, and (CUDA >= 11) the
   cuSPARSE SpMV/SpMM/csr2csc algorithm choices */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) { /* these options apply only to unfactored matrices */
    PetscBool transgen = cusparsestruct->transgen;

    ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
PetscFunctionReturn(0);
}

/* Symbolic ILU: reset any stale GPU triangular-factor data, run the host
   symbolic phase, and route the numeric phase to the CUSPARSE version */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic LU: same pattern as ILU above but with the full LU symbolic phase */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic ICC: host symbolic phase plus CUSPARSE numeric-phase hookup */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic Cholesky: host symbolic phase plus CUSPARSE numeric-phase hookup */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Build (or refresh) the unit-diagonal lower-triangular factor L on the GPU from
   the host ILU factorization stored in A (CSR with L and U interleaved per row) */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                          n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal.
*/ 363 nzLower=n+ai[n]-ai[1]; 364 if (!loTriFactor) { 365 PetscScalar *AALo; 366 367 cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr); 368 369 /* Allocate Space for the lower triangular matrix */ 370 cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 371 cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr); 372 373 /* Fill the lower triangular matrix */ 374 AiLo[0] = (PetscInt) 0; 375 AiLo[n] = nzLower; 376 AjLo[0] = (PetscInt) 0; 377 AALo[0] = (MatScalar) 1.0; 378 v = aa; 379 vi = aj; 380 offset = 1; 381 rowOffset= 1; 382 for (i=1; i<n; i++) { 383 nz = ai[i+1] - ai[i]; 384 /* additional 1 for the term on the diagonal */ 385 AiLo[i] = rowOffset; 386 rowOffset += nz+1; 387 388 ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr); 389 ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr); 390 391 offset += nz; 392 AjLo[offset] = (PetscInt) i; 393 AALo[offset] = (MatScalar) 1.0; 394 offset += 1; 395 396 v += nz; 397 vi += nz; 398 } 399 400 /* allocate space for the triangular factor information */ 401 ierr = PetscNew(&loTriFactor);CHKERRQ(ierr); 402 loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 403 /* Create the matrix description */ 404 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 405 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 406 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 407 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 408 #else 409 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 410 #endif 411 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat); 412 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 413 414 /* set the operation */ 415 loTriFactor->solveOp = 
CUSPARSE_OPERATION_NON_TRANSPOSE; 416 417 /* set the matrix */ 418 loTriFactor->csrMat = new CsrMatrix; 419 loTriFactor->csrMat->num_rows = n; 420 loTriFactor->csrMat->num_cols = n; 421 loTriFactor->csrMat->num_entries = nzLower; 422 423 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 424 loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1); 425 426 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 427 loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower); 428 429 loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 430 loTriFactor->csrMat->values->assign(AALo, AALo+nzLower); 431 432 /* Create the solve analysis information */ 433 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 434 stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 435 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 436 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 437 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 438 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 439 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 440 &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 441 cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 442 #endif 443 444 /* perform the solve analysis */ 445 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 446 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 447 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 448 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo 449 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 450 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 451 #endif 452 );CHKERRCUSPARSE(stat); 453 cerr = WaitForCUDA();CHKERRCUDA(cerr); 454 ierr = 
PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 455 456 /* assign the pointer */ 457 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 458 loTriFactor->AA_h = AALo; 459 cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr); 460 cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr); 461 ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 462 } else { /* update values only */ 463 if (!loTriFactor->AA_h) { 464 cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr); 465 } 466 /* Fill the lower triangular matrix */ 467 loTriFactor->AA_h[0] = 1.0; 468 v = aa; 469 vi = aj; 470 offset = 1; 471 for (i=1; i<n; i++) { 472 nz = ai[i+1] - ai[i]; 473 ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr); 474 offset += nz; 475 loTriFactor->AA_h[offset] = 1.0; 476 offset += 1; 477 v += nz; 478 } 479 loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower); 480 ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 481 } 482 } catch(char *ex) { 483 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 484 } 485 } 486 PetscFunctionReturn(0); 487 } 488 489 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 490 { 491 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 492 PetscInt n = A->rmap->n; 493 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 494 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 495 cusparseStatus_t stat; 496 const PetscInt *aj = a->j,*adiag = a->diag,*vi; 497 const MatScalar *aa = a->a,*v; 498 PetscInt *AiUp, *AjUp; 499 PetscInt i,nz, nzUpper, offset; 500 PetscErrorCode ierr; 501 cudaError_t cerr; 502 503 PetscFunctionBegin; 504 if (!n) PetscFunctionReturn(0); 505 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 506 try { 507 /* next, 
figure out the number of nonzeros in the upper triangular matrix. */ 508 nzUpper = adiag[0]-adiag[n]; 509 if (!upTriFactor) { 510 PetscScalar *AAUp; 511 512 cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 513 514 /* Allocate Space for the upper triangular matrix */ 515 cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 516 cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 517 518 /* Fill the upper triangular matrix */ 519 AiUp[0]=(PetscInt) 0; 520 AiUp[n]=nzUpper; 521 offset = nzUpper; 522 for (i=n-1; i>=0; i--) { 523 v = aa + adiag[i+1] + 1; 524 vi = aj + adiag[i+1] + 1; 525 526 /* number of elements NOT on the diagonal */ 527 nz = adiag[i] - adiag[i+1]-1; 528 529 /* decrement the offset */ 530 offset -= (nz+1); 531 532 /* first, set the diagonal elements */ 533 AjUp[offset] = (PetscInt) i; 534 AAUp[offset] = (MatScalar)1./v[nz]; 535 AiUp[i] = AiUp[i+1] - (nz+1); 536 537 ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr); 538 ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr); 539 } 540 541 /* allocate space for the triangular factor information */ 542 ierr = PetscNew(&upTriFactor);CHKERRQ(ierr); 543 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 544 545 /* Create the matrix description */ 546 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 547 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 548 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 549 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 550 #else 551 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 552 #endif 553 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 554 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 
555 556 /* set the operation */ 557 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 558 559 /* set the matrix */ 560 upTriFactor->csrMat = new CsrMatrix; 561 upTriFactor->csrMat->num_rows = n; 562 upTriFactor->csrMat->num_cols = n; 563 upTriFactor->csrMat->num_entries = nzUpper; 564 565 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 566 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 567 568 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 569 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 570 571 upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 572 upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 573 574 /* Create the solve analysis information */ 575 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 576 stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 577 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 578 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 579 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 580 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 581 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 582 &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 583 cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 584 #endif 585 586 /* perform the solve analysis */ 587 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 588 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 589 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 590 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo 591 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 592 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 593 #endif 594 );CHKERRCUSPARSE(stat); 595 cerr = 
WaitForCUDA();CHKERRCUDA(cerr); 596 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 597 598 /* assign the pointer */ 599 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 600 upTriFactor->AA_h = AAUp; 601 cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 602 cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 603 ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 604 } else { 605 if (!upTriFactor->AA_h) { 606 cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 607 } 608 /* Fill the upper triangular matrix */ 609 offset = nzUpper; 610 for (i=n-1; i>=0; i--) { 611 v = aa + adiag[i+1] + 1; 612 613 /* number of elements NOT on the diagonal */ 614 nz = adiag[i] - adiag[i+1]-1; 615 616 /* decrement the offset */ 617 offset -= (nz+1); 618 619 /* first, set the diagonal elements */ 620 upTriFactor->AA_h[offset] = 1./v[nz]; 621 ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr); 622 } 623 upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper); 624 ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 625 } 626 } catch(char *ex) { 627 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 628 } 629 } 630 PetscFunctionReturn(0); 631 } 632 633 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 634 { 635 PetscErrorCode ierr; 636 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 637 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 638 IS isrow = a->row,iscol = a->icol; 639 PetscBool row_identity,col_identity; 640 PetscInt n = A->rmap->n; 641 642 PetscFunctionBegin; 643 if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 644 ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr); 645 ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr); 646 647 if 
(!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 648 cusparseTriFactors->nnz=a->nz; 649 650 A->offloadmask = PETSC_OFFLOAD_BOTH; 651 /* lower triangular indices */ 652 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 653 if (!row_identity && !cusparseTriFactors->rpermIndices) { 654 const PetscInt *r; 655 656 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 657 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 658 cusparseTriFactors->rpermIndices->assign(r, r+n); 659 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 660 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 661 } 662 663 /* upper triangular indices */ 664 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 665 if (!col_identity && !cusparseTriFactors->cpermIndices) { 666 const PetscInt *c; 667 668 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 669 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 670 cusparseTriFactors->cpermIndices->assign(c, c+n); 671 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 672 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 673 } 674 PetscFunctionReturn(0); 675 } 676 677 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 678 { 679 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 680 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 681 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 682 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 683 cusparseStatus_t stat; 684 PetscErrorCode ierr; 685 cudaError_t cerr; 686 PetscInt *AiUp, *AjUp; 687 PetscScalar *AAUp; 688 PetscScalar *AALo; 689 PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 690 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 691 const PetscInt *ai = b->i,*aj = b->j,*vj; 692 const MatScalar *aa = b->a,*v; 693 694 PetscFunctionBegin; 695 if (!n) 
PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements; both factors store 1/d_i */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            /* off-diagonals: U gets -v, L gets -v/d_i */
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                                 );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        /* the lower factor reuses the upper-triangular storage and is
           applied as its transpose (solveOp below), hence fill mode UPPER */
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                                 );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else {
        /* factors already exist on the GPU: refill the numerical values only */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Copies the ICC factor data to the GPU, allocates the solve work vector,
   and uploads the factorization permutation and its inverse (when not the
   identity) used by the permuted MatSolve path. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n  = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  /* nnz of the full (symmetric) factor: strict triangle twice plus diagonal */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip,*rip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    /* cpermIndices gets the INVERSE permutation for the back-permute step */
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Numeric LU factorization: factor on the CPU, select the MatSolve variant
   (natural ordering vs. permuted) based on the row/column permutations, then
   copy the triangular factors to the GPU. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Numeric Cholesky factorization: factor on the CPU, select the MatSolve
   variant based on the ordering, then copy the ICC factors to the GPU. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Builds the transposed (CSC) copies of both triangular factors and runs the
   csrsv analysis on them, so MatSolveTranspose can run two NON_TRANSPOSE
   triangular solves. The fill mode is flipped since the transpose of an
   upper factor is lower (and vice versa). */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor */
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                       loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                       loTriFactor->csrMat->values->data().get(),
                                       loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(),
                                       loTriFactorT->csrMat->values->data().get(),
                                       loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
#else
                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
#endif
                          );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* BUGFIX: this was PetscLogEventBegin(), i.e. the event was begun twice and
     never ended, which corrupts -log_view timings for GenerateTranspose */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                           loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
#endif
                           );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor */
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                       upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                       upTriFactor->csrMat->values->data().get(),
                                       upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(),
                                       upTriFactorT->csrMat->values->data().get(),
                                       upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
#else
                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
#endif
                          );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* BUGFIX: this was PetscLogEventBegin(), mismatched with the Begin above
     (same defect as the lower-factor conversion) */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                           upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
#endif
                           );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}

/* Generates the transpose (CSC form) of A's GPU matrix so MatMultTranspose
   can be performed as a NON_TRANSPOSE SpMV on the transposed data. */
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct  = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;
PetscFunctionBegin;
  /* nothing to do when transpose generation is disabled or already done */
  if (!cusparsestruct->transgen || cusparsestruct->matTranspose) PetscFunctionReturn(0);
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* create cusparse matrix */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

  /* set alpha and beta: device-resident scalars used with POINTER_MODE_DEVICE */
  err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    /* transpose swaps row/column dimensions */
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* NOTE(review): rowoffsets_gpu is allocated unconditionally here; if it
       was already allocated elsewhere this leaks the previous array -- confirm */
    cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
    cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

    /* compute the transpose, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                         A->cmap->n, matrix->num_entries,
                                         matrix->values->data().get(),
                                         cusparsestruct->rowoffsets_gpu->data().get(),
                                         matrix->column_indices->data().get(),
                                         matrixT->values->data().get(),
                                         matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                         CUSPARSE_ACTION_NUMERIC,indexBase,
                                         cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat);
    err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err);
#endif

    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            cusparsestruct->rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                            CUSPARSE_ACTION_NUMERIC,indexBase,
                            cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
#else
                            matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase
#endif
                            );CHKERRCUSPARSE(stat);
    matstructT->mat = matrixT;

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    stat = cusparseCreateCsr(&matstructT->matDescr,
                             matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                             matrixT->values->data().get(),
                             CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                             indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
    CsrMatrix *temp  = new CsrMatrix;
    CsrMatrix *tempT = new CsrMatrix;
    /* First convert HYB to CSR */
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSPARSE(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSPARSE(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
#endif
  }
  err  = WaitForCUDA();CHKERRCUDA(err);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}

/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE?
*/
/* Solve A^T x = b using the cached transposed triangular factors (built lazily on
   first use).  Note the solve order is U-factor first, then L-factor -- the reverse
   of the forward solve.  NOTE(review): presumably each factor's solveOp, set up in
   MatSeqAIJCUSPARSEAnalyzeTransposeForSolve, makes these act as U^T and L^T; confirm
   there. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: x = b(rperm) */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U: x -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get()
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
#endif
                       );CHKERRCUSPARSE(stat);

  /* Then, solve L: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
#endif
                       );CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Transposed solve for factors stored in natural (identity) ordering: no row/column
   permutation copies are needed, so b feeds the U solve directly and the L solve
   writes straight into x. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U: b -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get()
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
#endif
                       );CHKERRCUSPARSE(stat);

  /* Then, solve L: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
#endif
                       );CHKERRCUSPARSE(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Forward solve A x = b on the GPU: permute b by the row permutation, solve L then U
   with the cached cusparse triangular-solve info, then apply the column permutation. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation.  The copied range is delimited by the
     index iterators (rpermIndices), so the same base pointer bGPU may be used for
     both ends of the permutation iterator. */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                       );CHKERRCUSPARSE(stat);

  /* Then, solve U: x -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get()
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                       );CHKERRCUSPARSE(stat);

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Forward solve for factors stored in natural ordering: no permutation copies,
   just L then U through the shared work vector. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L: b -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get()
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                       );CHKERRCUSPARSE(stat);

  /* Next, solve U: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                       );CHKERRCUSPARSE(stat);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Copy the (CSR) matrix values from the GPU back to the host aij arrays when the
   GPU holds the newest data; afterwards both copies are current (PETSC_OFFLOAD_BOTH).
   Only the values are copied -- the sparsity pattern is assumed unchanged. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cudaError_t        cerr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat;

    ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

/* Return a writable pointer to the host value array, syncing from the GPU first.
   The offload mask is downgraded to CPU because the caller may modify the values. */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  *array = a->a;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}

/* Build or refresh the GPU (cusparse) representation of the matrix.  If the nonzero
   pattern is unchanged and the format is CSR, only the values (and a cached
   transpose, if present) are updated; otherwise the whole GPU structure is rebuilt. */
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* Copy values only */
      CsrMatrix *matrix,*matrixT;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      matrix->values->assign(a->a, a->a+a->nz);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);

      /* Update matT when it was built before */
      if (cusparsestruct->matTranspose) {
        cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
        matrixT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
        ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
        /* NOTE: the CUDA-11 csr2csc takes output row_offsets/column_indices in the
           opposite argument order from the legacy API, hence the swapped branches. */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                                A->cmap->n, matrix->num_entries,
                                matrix->values->data().get(),
                                cusparsestruct->rowoffsets_gpu->data().get(),
                                matrix->column_indices->data().get(),
                                matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                CUSPARSE_ACTION_NUMERIC,indexBase,
                                cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
#else
                                matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                                CUSPARSE_ACTION_NUMERIC, indexBase
#endif
                               );CHKERRCUSPARSE(stat);
        err  = WaitForCUDA();CHKERRCUDA(err);
        ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
      }
    } else {
      /* Nonzero pattern changed (or non-CSR format): destroy and rebuild everything */
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      try {
        /* With compressed rows only the nonempty rows are stored on the GPU */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        cusparsestruct->nrows = m;

        /* create cusparse matrix */
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* Device-resident scalar constants; cusparse is put in device pointer mode below */
        err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = a->nz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(a->nz);
          mat->column_indices->assign(a->j, a->j+a->nz);

          mat->values = new THRUSTARRAY(a->nz);
          mat->values->assign(a->a, a->a+a->nz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                     mat->num_rows, mat->num_cols, mat->num_entries,
                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                     mat->values->data().get(),
                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* Stage the CSR data in a temporary, convert to HYB, then free the temporary */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = a->nz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(a->nz);
          mat->column_indices->assign(a->j, a->j+a->nz);

          mat->values = new THRUSTARRAY(a->nz);
          mat->values->assign(a->a, a->a+a->nz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
                                  matstruct->descr, mat->values->data().get(),
                                  mat->row_offsets->data().get(),
                                  mat->column_indices->data().get(),
                                  hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

/* Thrust functor: element-wise accumulate, get<1> += get<0> */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/* Thrust functor: element-wise copy, get<1> = get<0> */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
1865 1866 struct VecCUDAEqualsReverse 1867 { 1868 template <typename Tuple> 1869 __host__ __device__ 1870 void operator()(Tuple t) 1871 { 1872 thrust::get<0>(t) = thrust::get<1>(t); 1873 } 1874 }; 1875 1876 struct MatMatCusparse { 1877 PetscBool cisdense; 1878 PetscScalar *Bt; 1879 Mat X; 1880 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1881 PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 1882 cusparseDnMatDescr_t matBDescr; 1883 cusparseDnMatDescr_t matCDescr; 1884 size_t spmmBufferSize; 1885 void *spmmBuffer; 1886 PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/ 1887 #endif 1888 }; 1889 1890 static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 1891 { 1892 PetscErrorCode ierr; 1893 MatMatCusparse *mmdata = (MatMatCusparse *)data; 1894 cudaError_t cerr; 1895 1896 PetscFunctionBegin; 1897 cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr); 1898 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1899 cusparseStatus_t stat; 1900 if (mmdata->matBDescr) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat);} 1901 if (mmdata->matCDescr) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat);} 1902 if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);} 1903 #endif 1904 ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr); 1905 ierr = PetscFree(data);CHKERRQ(ierr); 1906 PetscFunctionReturn(0); 1907 } 1908 1909 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool); 1910 1911 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 1912 { 1913 Mat_Product *product = C->product; 1914 Mat A,B; 1915 PetscInt m,n,blda,clda; 1916 PetscBool flg,biscuda; 1917 Mat_SeqAIJCUSPARSE *cusp; 1918 cusparseStatus_t stat; 1919 cusparseOperation_t opA; 1920 const PetscScalar *barray; 1921 PetscScalar *carray; 1922 PetscErrorCode ierr; 1923 MatMatCusparse *mmdata; 1924 Mat_SeqAIJCUSPARSEMultStruct *mat; 1925 CsrMatrix *csrmat; 1926 cudaError_t 
cerr; 1927 1928 PetscFunctionBegin; 1929 MatCheckProduct(C,1); 1930 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 1931 mmdata = (MatMatCusparse*)product->data; 1932 A = product->A; 1933 B = product->B; 1934 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 1935 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 1936 /* currently CopyToGpu does not copy if the matrix is bound to CPU 1937 Instead of silently accepting the wrong answer, I prefer to raise the error */ 1938 if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 1939 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1940 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1941 switch (product->type) { 1942 case MATPRODUCT_AB: 1943 case MATPRODUCT_PtAP: 1944 mat = cusp->mat; 1945 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1946 m = A->rmap->n; 1947 n = B->cmap->n; 1948 break; 1949 case MATPRODUCT_AtB: 1950 if (!cusp->transgen) { 1951 mat = cusp->mat; 1952 opA = CUSPARSE_OPERATION_TRANSPOSE; 1953 } else { 1954 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1955 mat = cusp->matTranspose; 1956 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1957 } 1958 m = A->cmap->n; 1959 n = B->cmap->n; 1960 break; 1961 case MATPRODUCT_ABt: 1962 case MATPRODUCT_RARt: 1963 mat = cusp->mat; 1964 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1965 m = A->rmap->n; 1966 n = B->rmap->n; 1967 break; 1968 default: 1969 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 1970 } 1971 if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 1972 csrmat = (CsrMatrix*)mat->mat; 1973 /* if the user passed a CPU matrix, copy the data to the GPU */ 1974 ierr = 
PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr); 1975 if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);} 1976 ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr); 1977 1978 ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr); 1979 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 1980 ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 1981 ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr); 1982 } else { 1983 ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr); 1984 ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr); 1985 } 1986 1987 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1988 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1989 cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 1990 /* (re)allcoate spmmBuffer if not initialized or LDAs are different */ 1991 if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 1992 if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;} 1993 if (!mmdata->matBDescr) { 1994 stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 1995 mmdata->Blda = blda; 1996 } 1997 1998 if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;} 1999 if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 2000 stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2001 mmdata->Clda = clda; 2002 } 2003 2004 if (!mat->matDescr) { 2005 stat = cusparseCreateCsr(&mat->matDescr, 2006 csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, 2007 
csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), 2008 csrmat->values->data().get(), 2009 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2010 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 2011 } 2012 stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one, 2013 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2014 mmdata->matCDescr,cusparse_scalartype, 2015 cusp->spmmAlg,&mmdata->spmmBufferSize);CHKERRCUSPARSE(stat); 2016 if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);} 2017 cerr = cudaMalloc(&mmdata->spmmBuffer,mmdata->spmmBufferSize);CHKERRCUDA(cerr); 2018 mmdata->initialized = PETSC_TRUE; 2019 } else { 2020 /* to be safe, always update pointers of the mats */ 2021 stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat); 2022 stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat); 2023 stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat); 2024 } 2025 2026 /* do cusparseSpMM, which supports transpose on B */ 2027 stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one, 2028 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2029 mmdata->matCDescr,cusparse_scalartype, 2030 cusp->spmmAlg,mmdata->spmmBuffer);CHKERRCUSPARSE(stat); 2031 #else 2032 PetscInt k; 2033 /* cusparseXcsrmm does not support transpose on B */ 2034 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2035 cublasHandle_t cublasv2handle; 2036 cublasStatus_t cerr; 2037 2038 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 2039 cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T, 2040 B->cmap->n,B->rmap->n, 2041 &PETSC_CUSPARSE_ONE ,barray,blda, 2042 &PETSC_CUSPARSE_ZERO,barray,blda, 2043 mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr); 2044 blda = B->cmap->n; 2045 k = B->cmap->n; 2046 } else { 2047 k = B->rmap->n; 2048 } 2049 2050 /* perform 
                                       the MatMat operation, op(A) is m x k, op(B) is k x n */
    stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                             csrmat->num_entries,mat->alpha_one,mat->descr,
                             csrmat->values->data().get(),
                             csrmat->row_offsets->data().get(),
                             csrmat->column_indices->data().get(),
                             mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                             carray,clda);CHKERRCUSPARSE(stat);
#endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  if (product->type == MATPRODUCT_RARt) {
    /* the sparse SpMM above wrote into the intermediate X; finish C with a dense (CUDA) product */
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    /* as above, but the dense product is performed with the transpose flag set */
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  if (mmdata->cisdense) { /* the caller's C was plain MATSEQDENSE: convert the CUDA result back in place */
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) { /* B was not CUDA dense on entry; restore its type (presumably converted earlier in this routine) */
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Symbolic phase for C = op(A)*op(B) (and PtAP/RARt) with A SEQAIJCUSPARSE and B dense.
   Sets the size/type of C and attaches the MatMatCusparse product data; the numeric
   phase is MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(). Only MAT_CUSPARSE_CSR
   storage is supported. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  /* the size of the result C depends on the product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense;
#if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/* handles dense B */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product    *product = C->product;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A");
  if (product->A->boundtocpu) { /* A lives on the CPU: defer to the plain SeqAIJ x SeqDense implementation */
    ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(C);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_AtB:
  case MATPRODUCT_ABt:
  case MATPRODUCT_PtAP:
  case MATPRODUCT_RARt:
    C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
  default: /* other product types: leave productsymbolic unset */
    break;
  }
  PetscFunctionReturn(0);
}

/* y = A x */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* z = A x + y */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* y = A^H x (conjugate transpose) */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* z = A^H x + y */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* y = A^T x */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny;
#endif

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) { /* empty matrix: result is y (or zero when there is no y) */
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !cusparsestruct->transgen) {
      /* let cuSPARSE apply ^T/^H to the untransposed matrix */
      opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      /* use the explicitly generated transpose matrix instead */
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
         allocated to accommodate different uses. So we get the length info directly from mat.
      */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
      */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                       matstruct->matDescr,
                                       matstruct->cuSpMV[opA].vecXDescr, beta,
                                       matstruct->cuSpMV[opA].vecYDescr,
                                       cusparse_scalartype,
                                       cusparsestruct->spmvAlg,
                                       &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                          matstruct->alpha_one,
                          matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */
                          matstruct->cuSpMV[opA].vecXDescr,
                          beta,
                          matstruct->cuSpMV[opA].vecYDescr,
                          cusparse_scalartype,
                          cusparsestruct->spmvAlg,
                          matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
#else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
#endif
      }
    }
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* z = A^T x + y */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode             ierr;
  PetscSplitCSRDataStructure *d_mat = NULL;
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
  }
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it?
  if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0);
  if (d_mat) { /* a device (split-CSR) matrix exists, so assembly happened on the GPU */
    A->offloadmask = PETSC_OFFLOAD_GPU;
  }

  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.
   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Destroys the GPU-side data (mult/factor structures, the split-CSR device matrix,
   composed functions) and then the host SeqAIJ data. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode             ierr;
  PetscSplitCSRDataStructure *d_mat = NULL;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
    ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL;
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  if (d_mat) {
    /* copy the device struct back to the host so its device-allocated members can be freed */
    Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
    cudaError_t                err;
    PetscSplitCSRDataStructure h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    err  = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    if (a->compressedrow.use) {
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate as SeqAIJ on the host, then convert the copy in place to SEQAIJCUSPARSE */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Y = Y + a X. Runs on the GPU (cuBLAS axpy on the CSR values) when both matrices are
   SEQAIJCUSPARSE with the same nonzero pattern; otherwise falls back to the host path. */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str) // put axpy in aijcusparse, etc.
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  PetscBool      flgx,flgy;

  PetscFunctionBegin;
  if (a == (PetscScalar)0.0) PetscFunctionReturn(0);
  PetscValidHeaderSpecific(Y,MAT_CLASSID,1);
  PetscValidHeaderSpecific(X,MAT_CLASSID,3);
  ierr = PetscObjectTypeCompare((PetscObject)Y,MATSEQAIJCUSPARSE,&flgy);CHKERRQ(ierr);
  ierr = PetscObjectTypeCompare((PetscObject)X,MATSEQAIJCUSPARSE,&flgx);CHKERRQ(ierr);
  if (!flgx || !flgy) { /* not both CUSPARSE: host implementation */
    ierr = MatAXPY_SeqAIJ( Y, a, X, str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  if (Y->factortype != MAT_FACTOR_NONE || X->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"both matrices must be MAT_FACTOR_NONE");
  if (str == DIFFERENT_NONZERO_PATTERN) {
    /* the patterns may still be identical; if so, upgrade so the fast GPU path below applies */
    if (x->nz == y->nz) {
      PetscBool e;
      ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr);
      if (e) {
        ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr);
        if (e) {
          str = SAME_NONZERO_PATTERN;
        }
      }
    }
  }
  if (str != SAME_NONZERO_PATTERN) {
    ierr = MatAXPY_SeqAIJ( Y, a, X, str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  } else {
    Mat_SeqAIJCUSPARSE *cusparsestruct_y = (Mat_SeqAIJCUSPARSE*)Y->spptr;
    Mat_SeqAIJCUSPARSE *cusparsestruct_x = (Mat_SeqAIJCUSPARSE*)X->spptr;
    if (cusparsestruct_y->format!=MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
    if (cusparsestruct_x->format!=MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
    if (!cusparsestruct_y->mat || !cusparsestruct_x->mat) {
      /* GPU copies not built yet: do the axpy on the host and push the result to the GPU */
      if (Y->offloadmask == PETSC_OFFLOAD_UNALLOCATED || Y->offloadmask == PETSC_OFFLOAD_GPU) {
        ierr = MatSeqAIJCUSPARSECopyFromGPU(Y);CHKERRQ(ierr);
      }
      if (X->offloadmask == PETSC_OFFLOAD_UNALLOCATED || X->offloadmask == PETSC_OFFLOAD_GPU) {
        ierr = MatSeqAIJCUSPARSECopyFromGPU(X);CHKERRQ(ierr);
      }
      ierr = MatAXPY_SeqAIJ(Y,a,X,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
    } else {
      /* same pattern: the CSR value arrays line up, so a flat cublasXaxpy on the values suffices */
      cublasHandle_t cublasv2handle;
      cublasStatus_t cberr;
      cudaError_t    err;
      PetscScalar    alpha = a;
      PetscBLASInt   one = 1, bnz = 1;
      CsrMatrix      *matrix_y = (CsrMatrix*)cusparsestruct_y->mat->mat;
      CsrMatrix      *matrix_x = (CsrMatrix*)cusparsestruct_x->mat->mat;
      PetscScalar    *aa_y, *aa_x;
      aa_y = matrix_y->values->data().get();
      aa_x = matrix_x->values->data().get();
      ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
      ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      cberr = cublasXaxpy(cublasv2handle,bnz,&alpha,aa_x,one,aa_y,one);CHKERRCUBLAS(cberr);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
      ierr = PetscObjectStateIncrease((PetscObject)Y);CHKERRQ(ierr);
      /* only the GPU copy was updated */
      if (Y->offloadmask == PETSC_OFFLOAD_BOTH) Y->offloadmask = PETSC_OFFLOAD_GPU;
      else if (Y->offloadmask != PETSC_OFFLOAD_GPU) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"wrong state");
      ierr = MatSeqAIJCUSPARSECopyFromGPU(Y);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}

/* Zeros both the device-side CSR values (when present) and the host values array */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  PetscBool      both = PETSC_FALSE;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (spptr->mat) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE;
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
    if (spptr->matTranspose) { /* the transpose copy must be zeroed too */
      CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
      if (matrix->values) {
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
  }
  //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
  ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  /* both copies are zero only if the GPU values were zeroed above */
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;

  PetscFunctionReturn(0);
}

/* Switches the function table of A between the host (SeqAIJ) and GPU (CUSPARSE)
   implementations; flg == PETSC_TRUE binds to the CPU. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  if (flg) {
    /* make sure the host copy is current before handing control to the CPU kernels */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
  } else {
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  a->inode.use  = flg; /* inodes are a CPU-side optimization only */
  PetscFunctionReturn(0);
}

/* In-place (or duplicating) conversion of a SeqAIJ matrix to SEQAIJCUSPARSE:
   sets the CUDA default vec type, allocates spptr (with a cuSPARSE handle), and
   installs the GPU function table. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat;

  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      spptr->format = MAT_CUSPARSE_CSR;
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
      spptr->deviceMat = NULL;
    } else {
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type-registration constructor: build a SeqAIJ matrix and convert it in place */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr);
  ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
2760 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 2761 2762 Options Database Keys: 2763 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 2764 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 2765 - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 2766 2767 Level: beginner 2768 2769 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 2770 M*/ 2771 2772 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*); 2773 2774 2775 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 2776 { 2777 PetscErrorCode ierr; 2778 2779 PetscFunctionBegin; 2780 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 2781 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 2782 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 2783 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 2784 PetscFunctionReturn(0); 2785 } 2786 2787 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 2788 { 2789 PetscErrorCode ierr; 2790 cusparseStatus_t stat; 2791 2792 PetscFunctionBegin; 2793 if (*cusparsestruct) { 2794 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr); 2795 ierr = 
MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr); 2796 delete (*cusparsestruct)->workVector; 2797 delete (*cusparsestruct)->rowoffsets_gpu; 2798 delete (*cusparsestruct)->cooPerm; 2799 delete (*cusparsestruct)->cooPerm_a; 2800 delete (*cusparsestruct)->cooPerm_v; 2801 delete (*cusparsestruct)->cooPerm_w; 2802 if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);} 2803 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2804 cudaError_t cerr = cudaFree((*cusparsestruct)->csr2cscBuffer);CHKERRCUDA(cerr); 2805 #endif 2806 ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr); 2807 } 2808 PetscFunctionReturn(0); 2809 } 2810 2811 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 2812 { 2813 PetscFunctionBegin; 2814 if (*mat) { 2815 delete (*mat)->values; 2816 delete (*mat)->column_indices; 2817 delete (*mat)->row_offsets; 2818 delete *mat; 2819 *mat = 0; 2820 } 2821 PetscFunctionReturn(0); 2822 } 2823 2824 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 2825 { 2826 cusparseStatus_t stat; 2827 PetscErrorCode ierr; 2828 2829 PetscFunctionBegin; 2830 if (*trifactor) { 2831 if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); } 2832 if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); } 2833 ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 2834 if ((*trifactor)->solveBuffer) {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);} 2835 if ((*trifactor)->AA_h) {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);} 2836 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2837 if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);} 2838 #endif 2839 ierr = PetscFree(*trifactor);CHKERRQ(ierr); 2840 } 2841 PetscFunctionReturn(0); 
2842 } 2843 2844 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 2845 { 2846 CsrMatrix *mat; 2847 cusparseStatus_t stat; 2848 cudaError_t err; 2849 2850 PetscFunctionBegin; 2851 if (*matstruct) { 2852 if ((*matstruct)->mat) { 2853 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 2854 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2855 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 2856 #else 2857 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 2858 stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat); 2859 #endif 2860 } else { 2861 mat = (CsrMatrix*)(*matstruct)->mat; 2862 CsrMatrix_Destroy(&mat); 2863 } 2864 } 2865 if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); } 2866 delete (*matstruct)->cprowIndices; 2867 if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); } 2868 if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); } 2869 if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); } 2870 2871 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2872 Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 2873 if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);} 2874 for (int i=0; i<3; i++) { 2875 if (mdata->cuSpMV[i].initialized) { 2876 err = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err); 2877 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat); 2878 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat); 2879 } 2880 } 2881 #endif 2882 delete *matstruct; 2883 *matstruct = NULL; 2884 } 2885 PetscFunctionReturn(0); 2886 } 2887 2888 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors) 2889 { 2890 PetscErrorCode ierr; 2891 2892 
PetscFunctionBegin; 2893 if (*trifactors) { 2894 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr); 2895 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr); 2896 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr); 2897 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr); 2898 delete (*trifactors)->rpermIndices; 2899 delete (*trifactors)->cpermIndices; 2900 delete (*trifactors)->workVector; 2901 (*trifactors)->rpermIndices = NULL; 2902 (*trifactors)->cpermIndices = NULL; 2903 (*trifactors)->workVector = NULL; 2904 } 2905 PetscFunctionReturn(0); 2906 } 2907 2908 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 2909 { 2910 PetscErrorCode ierr; 2911 cusparseHandle_t handle; 2912 cusparseStatus_t stat; 2913 2914 PetscFunctionBegin; 2915 if (*trifactors) { 2916 ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr); 2917 if (handle = (*trifactors)->handle) { 2918 stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 2919 } 2920 ierr = PetscFree(*trifactors);CHKERRQ(ierr); 2921 } 2922 PetscFunctionReturn(0); 2923 } 2924 2925 struct IJCompare 2926 { 2927 __host__ __device__ 2928 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 2929 { 2930 if (t1.get<0>() < t2.get<0>()) return true; 2931 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 2932 return false; 2933 } 2934 }; 2935 2936 struct IJEqual 2937 { 2938 __host__ __device__ 2939 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 2940 { 2941 if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false; 2942 return true; 2943 } 2944 }; 2945 2946 struct IJDiff 2947 { 2948 __host__ __device__ 2949 inline PetscInt operator() (const PetscInt 
&t1, const PetscInt &t2) 2950 { 2951 return t1 == t2 ? 0 : 1; 2952 } 2953 }; 2954 2955 struct IJSum 2956 { 2957 __host__ __device__ 2958 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 2959 { 2960 return t1||t2; 2961 } 2962 }; 2963 2964 #include <thrust/iterator/discard_iterator.h> 2965 PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 2966 { 2967 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2968 CsrMatrix *matrix; 2969 PetscErrorCode ierr; 2970 cudaError_t cerr; 2971 PetscInt n; 2972 2973 PetscFunctionBegin; 2974 if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct"); 2975 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix"); 2976 if (!cusp->cooPerm) { 2977 ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2978 ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2979 PetscFunctionReturn(0); 2980 } 2981 n = cusp->cooPerm->size(); 2982 matrix = (CsrMatrix*)cusp->mat->mat; 2983 if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 2984 if (!cusp->cooPerm_v) { cusp->cooPerm_v = new THRUSTARRAY(n); } 2985 ierr = PetscLogEventBegin(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr); 2986 if (v) { 2987 cusp->cooPerm_v->assign(v,v+n); 2988 ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); 2989 } 2990 else thrust::fill(thrust::device,cusp->cooPerm_v->begin(),cusp->cooPerm_v->end(),0.); 2991 if (imode == ADD_VALUES) { 2992 if (cusp->cooPerm_a) { 2993 if (!cusp->cooPerm_w) cusp->cooPerm_w = new THRUSTARRAY(matrix->values->size()); 2994 auto vbit = thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()); 2995 thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cusp->cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 2996 
thrust::transform(cusp->cooPerm_w->begin(),cusp->cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>()); 2997 } else { 2998 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()), 2999 matrix->values->begin())); 3000 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()), 3001 matrix->values->end())); 3002 thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); 3003 } 3004 } else { 3005 if (cusp->cooPerm_a) { /* non unique values insertion, result is undefined (we cannot guarantee last takes precedence) 3006 if we are inserting two different values into the same location */ 3007 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()), 3008 thrust::make_permutation_iterator(matrix->values->begin(),cusp->cooPerm_a->begin()))); 3009 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()), 3010 thrust::make_permutation_iterator(matrix->values->begin(),cusp->cooPerm_a->end()))); 3011 thrust::for_each(zibit,zieit,VecCUDAEquals()); 3012 } else { 3013 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()), 3014 matrix->values->begin())); 3015 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()), 3016 matrix->values->end())); 3017 thrust::for_each(zibit,zieit,VecCUDAEquals()); 3018 } 3019 } 3020 cerr = WaitForCUDA();CHKERRCUDA(cerr); 3021 ierr = PetscLogEventEnd(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr); 3022 A->offloadmask = PETSC_OFFLOAD_GPU; 3023 ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3024 ierr = 
MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3025 /* we can remove this call when MatSeqAIJGetArray operations are used everywhere! */ 3026 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 3027 PetscFunctionReturn(0); 3028 } 3029 3030 #include <thrust/binary_search.h> 3031 PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[]) 3032 { 3033 PetscErrorCode ierr; 3034 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3035 CsrMatrix *matrix; 3036 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3037 PetscInt cooPerm_n, nzr = 0; 3038 cudaError_t cerr; 3039 3040 PetscFunctionBegin; 3041 ierr = PetscLogEventBegin(MAT_CUSPARSEPreallCOO,A,0,0,0);CHKERRQ(ierr); 3042 ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr); 3043 ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr); 3044 cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0; 3045 if (n != cooPerm_n) { 3046 delete cusp->cooPerm; 3047 delete cusp->cooPerm_v; 3048 delete cusp->cooPerm_w; 3049 delete cusp->cooPerm_a; 3050 cusp->cooPerm = NULL; 3051 cusp->cooPerm_v = NULL; 3052 cusp->cooPerm_w = NULL; 3053 cusp->cooPerm_a = NULL; 3054 } 3055 if (n) { 3056 THRUSTINTARRAY d_i(n); 3057 THRUSTINTARRAY d_j(n); 3058 THRUSTINTARRAY ii(A->rmap->n); 3059 3060 if (!cusp->cooPerm) { cusp->cooPerm = new THRUSTINTARRAY(n); } 3061 if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); } 3062 3063 ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 3064 d_i.assign(coo_i,coo_i+n); 3065 d_j.assign(coo_j,coo_j+n); 3066 auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin())); 3067 auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end())); 3068 3069 thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0); 3070 thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); 3071 *cusp->cooPerm_a = d_i; 3072 THRUSTINTARRAY w = d_j; 3073 3074 auto nekey = 
thrust::unique(fkey, ekey, IJEqual()); 3075 if (nekey == ekey) { /* all entries are unique */ 3076 delete cusp->cooPerm_a; 3077 cusp->cooPerm_a = NULL; 3078 } else { /* I couldn't come up with a more elegant algorithm */ 3079 adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); 3080 adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff()); 3081 (*cusp->cooPerm_a)[0] = 0; 3082 w[0] = 0; 3083 thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); 3084 thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); 3085 } 3086 thrust::counting_iterator<PetscInt> search_begin(0); 3087 thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), 3088 search_begin, search_begin + A->rmap->n, 3089 ii.begin()); 3090 3091 ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr); 3092 a->singlemalloc = PETSC_FALSE; 3093 a->free_a = PETSC_TRUE; 3094 a->free_ij = PETSC_TRUE; 3095 ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr); 3096 a->i[0] = 0; 3097 cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 3098 a->nz = a->maxnz = a->i[A->rmap->n]; 3099 ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr); 3100 ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr); 3101 cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 3102 if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); } 3103 if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); } 3104 for (PetscInt i = 0; i < A->rmap->n; i++) { 3105 const PetscInt nnzr = a->i[i+1] - a->i[i]; 3106 nzr += (PetscInt)!!(nnzr); 3107 a->ilen[i] = a->imax[i] = nnzr; 3108 } 3109 A->preallocated = PETSC_TRUE; 3110 ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr); 3111 } else { 3112 ierr = 
MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr); 3113 } 3114 cerr = WaitForCUDA();CHKERRCUDA(cerr); 3115 ierr = PetscLogEventEnd(MAT_CUSPARSEPreallCOO,A,0,0,0);CHKERRQ(ierr); 3116 3117 /* We want to allocate the CUSPARSE struct for matvec now. 3118 The code is so convoluted now that I prefer to copy garbage to the GPU */ 3119 ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr); 3120 A->offloadmask = PETSC_OFFLOAD_CPU; 3121 A->nonzerostate++; 3122 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 3123 { 3124 matrix = (CsrMatrix*)cusp->mat->mat; 3125 if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3126 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3127 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr); 3128 } 3129 3130 A->offloadmask = PETSC_OFFLOAD_CPU; 3131 A->assembled = PETSC_FALSE; 3132 A->was_assembled = PETSC_FALSE; 3133 PetscFunctionReturn(0); 3134 } 3135