1 /* 2 Defines the basic matrix operations for the AIJ (compressed row) 3 matrix storage format using the CUSPARSE library, 4 */ 5 #define PETSC_SKIP_SPINLOCK 6 #define PETSC_SKIP_CXX_COMPLEX_FIX 7 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 8 9 #include <petscconf.h> 10 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 11 #include <../src/mat/impls/sbaij/seq/sbaij.h> 12 #include <../src/vec/vec/impls/dvecimpl.h> 13 #include <petsc/private/vecimpl.h> 14 #undef VecType 15 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 16 17 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 18 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 19 /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in 20 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them. 21 22 typedef enum { 23 CUSPARSE_MV_ALG_DEFAULT = 0, 24 CUSPARSE_COOMV_ALG = 1, 25 CUSPARSE_CSRMV_ALG1 = 2, 26 CUSPARSE_CSRMV_ALG2 = 3 27 } cusparseSpMVAlg_t; 28 29 typedef enum { 30 CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0, 31 CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1, 32 CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2, 33 CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3, 34 CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4, 35 CUSPARSE_SPMM_ALG_DEFAULT = 0, 36 CUSPARSE_SPMM_COO_ALG1 = 1, 37 CUSPARSE_SPMM_COO_ALG2 = 2, 38 CUSPARSE_SPMM_COO_ALG3 = 3, 39 CUSPARSE_SPMM_COO_ALG4 = 5, 40 CUSPARSE_SPMM_CSR_ALG1 = 4, 41 CUSPARSE_SPMM_CSR_ALG2 = 6, 42 } cusparseSpMMAlg_t; 43 44 typedef enum { 45 CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc 46 CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministc 47 } cusparseCsr2CscAlg_t; 48 */ 49 const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0}; 50 const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0}; 51 const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0}; 52 #endif 53 54 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 55 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 56 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 57 58 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 59 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 60 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 61 62 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 63 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 64 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 65 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 66 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat); 67 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure); 68 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar); 69 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 70 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 71 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 72 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 73 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 74 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 75 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool); 76 77 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**); 78 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**); 79 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat); 80 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**); 81 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**); 82 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**); 83 84 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat); 85 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat); 86 87 PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]); 88 PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode); 89 90 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]); 91 92 PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream) 93 { 94 cusparseStatus_t stat; 95 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 96 97 PetscFunctionBegin; 98 if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 99 cusparsestruct->stream = stream; 100 stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat); 101 PetscFunctionReturn(0); 102 } 103 104 PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle) 105 { 106 cusparseStatus_t stat; 107 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 108 109 PetscFunctionBegin; 110 if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 111 if (cusparsestruct->handle != handle) { 112 if (cusparsestruct->handle) { 113 stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat); 114 } 115 cusparsestruct->handle = handle; 116 } 117 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 118 PetscFunctionReturn(0); 119 } 120 121 PetscErrorCode MatCUSPARSEClearHandle(Mat A) 122 { 123 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 124 PetscBool flg; 125 PetscErrorCode ierr; 126 127 PetscFunctionBegin; 128 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 129 if (!flg || !cusparsestruct) PetscFunctionReturn(0); 130 if (cusparsestruct->handle) cusparsestruct->handle = 0; 131 PetscFunctionReturn(0); 132 } 133 134 PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type) 135 { 136 PetscFunctionBegin; 137 *type = MATSOLVERCUSPARSE; 138 PetscFunctionReturn(0); 139 } 140 141 /*MC 142 MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices 143 on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported 144 algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 145 performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 146 CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 147 algorithms are not recommended. This class does NOT support direct solver operations. 148 149 Level: beginner 150 151 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 152 M*/ 153 154 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B) 155 { 156 PetscErrorCode ierr; 157 PetscInt n = A->rmap->n; 158 159 PetscFunctionBegin; 160 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 161 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 162 (*B)->factortype = ftype; 163 (*B)->useordering = PETSC_TRUE; 164 ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 165 166 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 167 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 168 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 169 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 170 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 171 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 172 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 173 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 174 175 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 176 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr); 177 PetscFunctionReturn(0); 178 } 179 180 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 181 { 182 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 183 184 PetscFunctionBegin; 185 switch (op) { 186 case MAT_CUSPARSE_MULT: 187 cusparsestruct->format = format; 188 break; 189 case MAT_CUSPARSE_ALL: 190 cusparsestruct->format = format; 191 break; 192 default: 193 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op); 194 } 195 PetscFunctionReturn(0); 196 } 197 198 /*@ 199 MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 200 operation. Only the MatMult operation can use different GPU storage formats 201 for MPIAIJCUSPARSE matrices. 202 Not Collective 203 204 Input Parameters: 205 + A - Matrix of type SEQAIJCUSPARSE 206 . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 207 - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2) 208 209 Output Parameter: 210 211 Level: intermediate 212 213 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 214 @*/ 215 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 216 { 217 PetscErrorCode ierr; 218 219 PetscFunctionBegin; 220 PetscValidHeaderSpecific(A, MAT_CLASSID,1); 221 ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr); 222 PetscFunctionReturn(0); 223 } 224 225 /*@ 226 MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose 227 228 Collective on mat 229 230 Input Parameters: 231 + A - Matrix of type SEQAIJCUSPARSE 232 - transgen - the boolean flag 233 234 Level: intermediate 235 236 .seealso: MATSEQAIJCUSPARSE, MatAIJCUSPARSESetGenerateTranspose() 237 @*/ 238 PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen) 239 { 240 PetscErrorCode ierr; 241 PetscBool flg; 242 243 PetscFunctionBegin; 244 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 245 ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 246 if (flg) { 247 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 248 249 if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 250 cusp->transgen = transgen; 251 if (!transgen) { /* need to destroy the transpose matrix if present to prevent from logic errors if transgen is set to true later */ 252 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr); 253 } 254 } 255 PetscFunctionReturn(0); 256 } 257 258 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 259 { 260 PetscErrorCode ierr; 261 MatCUSPARSEStorageFormat format; 262 PetscBool flg; 263 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 264 265 PetscFunctionBegin; 266 ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr); 267 if (A->factortype == MAT_FACTOR_NONE) { 268 PetscBool transgen = cusparsestruct->transgen; 269 270 ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr); 271 if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);} 272 273 ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV", 274 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 275 if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);} 276 277 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 278 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 279 if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);} 280 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 281 cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 282 ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", 283 "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr); 284 /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */ 285 if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 286 287 cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 288 ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", 289 "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr); 290 if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly"); 291 292 cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 293 ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", 294 "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr); 295 if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly"); 296 #endif 297 } 298 ierr = PetscOptionsTail();CHKERRQ(ierr); 299 PetscFunctionReturn(0); 300 } 301 302 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 303 { 304 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 305 PetscErrorCode ierr; 306 307 PetscFunctionBegin; 308 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 309 ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 310 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 311 PetscFunctionReturn(0); 312 } 313 314 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 315 { 316 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 317 PetscErrorCode ierr; 318 319 PetscFunctionBegin; 320 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 321 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 322 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 323 PetscFunctionReturn(0); 324 } 325 326 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 327 { 328 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 329 PetscErrorCode ierr; 330 331 PetscFunctionBegin; 332 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 333 ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 334 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 335 PetscFunctionReturn(0); 336 } 337 338 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 339 { 340 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 341 PetscErrorCode ierr; 342 343 PetscFunctionBegin; 344 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 345 ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 346 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 347 PetscFunctionReturn(0); 348 } 349 350 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 351 { 352 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 353 PetscInt n = A->rmap->n; 354 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 355 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 356 cusparseStatus_t stat; 357 const PetscInt *ai = a->i,*aj = a->j,*vi; 358 const MatScalar *aa = a->a,*v; 359 PetscInt *AiLo, *AjLo; 360 PetscInt i,nz, nzLower, offset, rowOffset; 361 PetscErrorCode ierr; 362 cudaError_t cerr; 363 364 PetscFunctionBegin; 365 if (!n) PetscFunctionReturn(0); 366 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 367 try { 368 /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */ 369 nzLower=n+ai[n]-ai[1]; 370 if (!loTriFactor) { 371 PetscScalar *AALo; 372 373 cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr); 374 375 /* Allocate Space for the lower triangular matrix */ 376 cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 377 cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr); 378 379 /* Fill the lower triangular matrix */ 380 AiLo[0] = (PetscInt) 0; 381 AiLo[n] = nzLower; 382 AjLo[0] = (PetscInt) 0; 383 AALo[0] = (MatScalar) 1.0; 384 v = aa; 385 vi = aj; 386 offset = 1; 387 rowOffset= 1; 388 for (i=1; i<n; i++) { 389 nz = ai[i+1] - ai[i]; 390 /* additional 1 for the term on the diagonal */ 391 AiLo[i] = rowOffset; 392 rowOffset += nz+1; 393 394 ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr); 395 ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr); 396 397 offset += nz; 398 AjLo[offset] = (PetscInt) i; 399 AALo[offset] = (MatScalar) 1.0; 400 offset += 1; 401 402 v += nz; 403 vi += nz; 404 } 405 406 /* allocate space for the triangular factor information */ 407 ierr = PetscNew(&loTriFactor);CHKERRQ(ierr); 408 loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 409 /* Create the matrix description */ 410 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 411 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 412 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 413 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 414 #else 415 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 416 #endif 417 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat); 418 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 419 420 /* set the operation */ 421 loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 422 423 /* set the matrix */ 424 loTriFactor->csrMat = new CsrMatrix; 425 loTriFactor->csrMat->num_rows = n; 426 loTriFactor->csrMat->num_cols = n; 427 loTriFactor->csrMat->num_entries = nzLower; 428 429 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 430 loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1); 431 432 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 433 loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower); 434 435 loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 436 loTriFactor->csrMat->values->assign(AALo, AALo+nzLower); 437 438 /* Create the solve analysis information */ 439 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 440 stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 441 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 442 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 443 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 444 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 445 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 446 &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 447 cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 448 #endif 449 450 /* perform the solve analysis */ 451 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 452 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 453 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 454 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo 455 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 456 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 457 #endif 458 );CHKERRCUSPARSE(stat); 459 cerr = WaitForCUDA();CHKERRCUDA(cerr); 460 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 461 462 /* assign the pointer */ 463 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 464 loTriFactor->AA_h = AALo; 465 cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr); 466 cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr); 467 ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 468 } else { /* update values only */ 469 if (!loTriFactor->AA_h) { 470 cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr); 471 } 472 /* Fill the lower triangular matrix */ 473 loTriFactor->AA_h[0] = 1.0; 474 v = aa; 475 vi = aj; 476 offset = 1; 477 for (i=1; i<n; i++) { 478 nz = ai[i+1] - ai[i]; 479 ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr); 480 offset += nz; 481 loTriFactor->AA_h[offset] = 1.0; 482 offset += 1; 483 v += nz; 484 } 485 loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower); 486 ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 487 } 488 } catch(char *ex) { 489 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 490 } 491 } 492 PetscFunctionReturn(0); 493 } 494 495 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 496 { 497 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 498 PetscInt n = A->rmap->n; 499 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 500 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 501 cusparseStatus_t stat; 502 const PetscInt *aj = a->j,*adiag = a->diag,*vi; 503 const MatScalar *aa = a->a,*v; 504 PetscInt *AiUp, *AjUp; 505 PetscInt i,nz, nzUpper, offset; 506 PetscErrorCode ierr; 507 cudaError_t cerr; 508 509 PetscFunctionBegin; 510 if (!n) PetscFunctionReturn(0); 511 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 512 try { 513 /* next, figure out the number of nonzeros in the upper triangular matrix. */ 514 nzUpper = adiag[0]-adiag[n]; 515 if (!upTriFactor) { 516 PetscScalar *AAUp; 517 518 cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 519 520 /* Allocate Space for the upper triangular matrix */ 521 cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 522 cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 523 524 /* Fill the upper triangular matrix */ 525 AiUp[0]=(PetscInt) 0; 526 AiUp[n]=nzUpper; 527 offset = nzUpper; 528 for (i=n-1; i>=0; i--) { 529 v = aa + adiag[i+1] + 1; 530 vi = aj + adiag[i+1] + 1; 531 532 /* number of elements NOT on the diagonal */ 533 nz = adiag[i] - adiag[i+1]-1; 534 535 /* decrement the offset */ 536 offset -= (nz+1); 537 538 /* first, set the diagonal elements */ 539 AjUp[offset] = (PetscInt) i; 540 AAUp[offset] = (MatScalar)1./v[nz]; 541 AiUp[i] = AiUp[i+1] - (nz+1); 542 543 ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr); 544 ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr); 545 } 546 547 /* allocate space for the triangular factor information */ 548 ierr = PetscNew(&upTriFactor);CHKERRQ(ierr); 549 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 550 551 /* Create the matrix description */ 552 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 553 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 554 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 555 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 556 #else 557 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 558 #endif 559 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 560 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 561 562 /* set the operation */ 563 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 564 565 /* set the matrix */ 566 upTriFactor->csrMat = new CsrMatrix; 567 upTriFactor->csrMat->num_rows = n; 568 upTriFactor->csrMat->num_cols = n; 569 upTriFactor->csrMat->num_entries = nzUpper; 570 571 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 572 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 573 574 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 575 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 576 577 upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 578 upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 579 580 /* Create the solve analysis information */ 581 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 582 stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 583 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 584 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 585 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 586 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 587 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 588 &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 589 cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 590 #endif 591 592 /* perform the solve analysis */ 593 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 594 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 595 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 596 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo 597 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 598 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 599 #endif 600 );CHKERRCUSPARSE(stat); 601 cerr = WaitForCUDA();CHKERRCUDA(cerr); 602 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 603 604 /* assign the pointer */ 605 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 606 upTriFactor->AA_h = AAUp; 607 cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 608 cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 609 ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 610 } else { 611 if (!upTriFactor->AA_h) { 612 cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 613 } 614 /* Fill the upper triangular matrix */ 615 offset = nzUpper; 616 for (i=n-1; i>=0; i--) { 617 v = aa + adiag[i+1] + 1; 618 619 /* number of elements NOT on the diagonal */ 620 nz = adiag[i] - adiag[i+1]-1; 621 622 /* decrement the offset */ 623 offset -= (nz+1); 624 625 /* first, set the diagonal elements */ 626 upTriFactor->AA_h[offset] = 1./v[nz]; 627 ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr); 628 } 629 upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper); 630 ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 631 } 632 } catch(char *ex) { 633 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 634 } 635 } 636 PetscFunctionReturn(0); 637 } 638 639 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 640 { 641 PetscErrorCode ierr; 642 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 643 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 644 IS isrow = a->row,iscol = a->icol; 645 PetscBool row_identity,col_identity; 646 PetscInt n = A->rmap->n; 647 648 PetscFunctionBegin; 649 if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 650 ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr); 651 ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr); 652 653 if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 654 cusparseTriFactors->nnz=a->nz; 655 656 A->offloadmask = PETSC_OFFLOAD_BOTH; 657 /* lower triangular indices */ 658 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 659 if (!row_identity && !cusparseTriFactors->rpermIndices) { 660 const PetscInt *r; 661 662 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 663 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 664 cusparseTriFactors->rpermIndices->assign(r, r+n); 665 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 666 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 667 } 668 669 /* upper triangular indices */ 670 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 671 if (!col_identity && !cusparseTriFactors->cpermIndices) { 672 const PetscInt *c; 673 674 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 675 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 676 cusparseTriFactors->cpermIndices->assign(c, c+n); 677 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 678 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 679 } 680 PetscFunctionReturn(0); 681 } 682 683 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 684 { 685 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 686 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 687 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 688 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 689 cusparseStatus_t stat; 690 PetscErrorCode ierr; 691 cudaError_t cerr; 692 PetscInt *AiUp, *AjUp; 693 PetscScalar *AAUp; 694 PetscScalar *AALo; 695 PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 696 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 697 const PetscInt *ai = b->i,*aj = b->j,*vj; 698 const MatScalar *aa = b->a,*v; 699 700 PetscFunctionBegin; 701 if (!n) PetscFunctionReturn(0); 702 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 703 try { 704 cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 705 cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 706 if (!upTriFactor && !loTriFactor) { 707 /* Allocate Space for the upper triangular matrix */ 708 cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 709 cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 710 711 /* Fill the upper triangular matrix */ 712 AiUp[0]=(PetscInt) 0; 713 AiUp[n]=nzUpper; 714 offset = 0; 715 for (i=0; i<n; i++) { 716 /* set the pointers */ 717 v = aa + ai[i]; 718 vj = aj + ai[i]; 719 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 720 721 /* first, set the diagonal elements */ 722 AjUp[offset] = (PetscInt) i; 723 AAUp[offset] = (MatScalar)1.0/v[nz]; 724 AiUp[i] = offset; 725 AALo[offset] = (MatScalar)1.0/v[nz]; 726 727 offset+=1; 728 if (nz>0) { 729 ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr); 730 ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 731 for (j=offset; j<offset+nz; j++) { 732 AAUp[j] = -AAUp[j]; 733 AALo[j] = AAUp[j]/v[nz]; 734 } 735 offset+=nz; 736 } 737 } 738 739 /* allocate space for the triangular factor information */ 740 ierr = PetscNew(&upTriFactor);CHKERRQ(ierr); 741 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 742 743 /* Create the matrix description */ 744 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 745 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 746 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 747 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 748 #else 749 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 750 #endif 751 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 752 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 753 754 /* set the matrix */ 755 upTriFactor->csrMat = new CsrMatrix; 756 upTriFactor->csrMat->num_rows = A->rmap->n; 757 upTriFactor->csrMat->num_cols = A->cmap->n; 758 upTriFactor->csrMat->num_entries = a->nz; 759 760 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 761 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 762 763 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 764 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 765 766 upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 767 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 768 769 /* set the operation */ 770 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 771 772 /* Create the solve analysis information */ 773 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 774 stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 775 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 776 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 777 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 778 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 779 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 780 &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 781 cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 782 #endif 783 784 /* perform the solve analysis */ 785 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 786 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 787 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 788 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo 789 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 790 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 791 #endif 792 );CHKERRCUSPARSE(stat); 793 cerr = WaitForCUDA();CHKERRCUDA(cerr); 794 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 795 796 /* assign the pointer */ 797 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 798 799 /* allocate space for the triangular factor information */ 800 ierr = PetscNew(&loTriFactor);CHKERRQ(ierr); 801 loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 802 803 /* Create the matrix description */ 804 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 805 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 806 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 807 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 808 #else 809 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 810 #endif 811 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 812 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 813 814 /* set the operation */ 815 loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 816 817 /* set the matrix */ 818 loTriFactor->csrMat = new CsrMatrix; 819 loTriFactor->csrMat->num_rows = A->rmap->n; 820 loTriFactor->csrMat->num_cols = A->cmap->n; 821 loTriFactor->csrMat->num_entries = a->nz; 822 823 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 824 loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 825 826 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 827 loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 828 829 loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 830 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 831 832 /* Create the solve analysis information */ 833 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 834 stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 835 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 836 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 837 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 838 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 839 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 840 &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 841 cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 842 #endif 843 844 /* perform the solve analysis */ 845 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 846 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 847 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 848 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo 849 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 850 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 851 #endif 852 );CHKERRCUSPARSE(stat); 853 cerr = WaitForCUDA();CHKERRCUDA(cerr); 854 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 855 856 /* assign the pointer */ 857 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 858 859 ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr); 860 cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 861 cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 862 } else { 863 /* Fill the upper triangular matrix */ 864 offset = 0; 865 for (i=0; i<n; i++) { 866 /* set the pointers */ 867 v = aa + ai[i]; 868 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 869 870 /* first, set the diagonal elements */ 871 AAUp[offset] = 1.0/v[nz]; 872 AALo[offset] = 1.0/v[nz]; 873 874 offset+=1; 875 if (nz>0) { 876 ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 877 for (j=offset; j<offset+nz; j++) { 878 AAUp[j] = -AAUp[j]; 879 AALo[j] = AAUp[j]/v[nz]; 880 } 881 offset+=nz; 882 } 883 } 884 if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 885 if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 886 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 887 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 888 ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 889 } 890 cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr); 891 cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr); 892 } catch(char *ex) { 893 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 894 } 895 } 896 PetscFunctionReturn(0); 897 } 898 899 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 900 { 901 PetscErrorCode ierr; 902 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 903 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 904 IS ip = a->row; 905 PetscBool perm_identity; 906 PetscInt n = A->rmap->n; 907 908 PetscFunctionBegin; 909 if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 910 ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr); 911 if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 912 cusparseTriFactors->nnz=(a->nz-n)*2 + n; 913 914 A->offloadmask = PETSC_OFFLOAD_BOTH; 915 916 /* lower triangular indices */ 917 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 918 if (!perm_identity) { 919 IS iip; 920 const PetscInt *irip,*rip; 921 922 ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr); 923 ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr); 924 ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); 925 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 926 cusparseTriFactors->rpermIndices->assign(rip, rip+n); 927 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 928 cusparseTriFactors->cpermIndices->assign(irip, irip+n); 929 ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr); 930 ierr = ISDestroy(&iip);CHKERRQ(ierr); 931 ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 932 ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 933 } 934 PetscFunctionReturn(0); 935 } 936 937 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 938 { 939 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 940 IS isrow = b->row,iscol = b->col; 941 PetscBool row_identity,col_identity; 942 PetscErrorCode ierr; 943 944 PetscFunctionBegin; 945 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 946 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 947 B->offloadmask = PETSC_OFFLOAD_CPU; 948 /* determine which version of MatSolve needs to be used. */ 949 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 950 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 951 if (row_identity && col_identity) { 952 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 953 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 954 B->ops->matsolve = NULL; 955 B->ops->matsolvetranspose = NULL; 956 } else { 957 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 958 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 959 B->ops->matsolve = NULL; 960 B->ops->matsolvetranspose = NULL; 961 } 962 963 /* get the triangular factors */ 964 ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 965 PetscFunctionReturn(0); 966 } 967 968 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 969 { 970 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 971 IS ip = b->row; 972 PetscBool perm_identity; 973 PetscErrorCode ierr; 974 975 PetscFunctionBegin; 976 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 977 ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 978 B->offloadmask = PETSC_OFFLOAD_CPU; 979 /* determine which version of MatSolve needs to be used. */ 980 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 981 if (perm_identity) { 982 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 983 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 984 B->ops->matsolve = NULL; 985 B->ops->matsolvetranspose = NULL; 986 } else { 987 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 988 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 989 B->ops->matsolve = NULL; 990 B->ops->matsolvetranspose = NULL; 991 } 992 993 /* get the triangular factors */ 994 ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 995 PetscFunctionReturn(0); 996 } 997 998 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 999 { 1000 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1001 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1002 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1003 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; 1004 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; 1005 cusparseStatus_t stat; 1006 cusparseIndexBase_t indexBase; 1007 cusparseMatrixType_t matrixType; 1008 cusparseFillMode_t fillMode; 1009 cusparseDiagType_t diagType; 1010 cudaError_t cerr; 1011 PetscErrorCode ierr; 1012 1013 PetscFunctionBegin; 1014 /* allocate space for the transpose of the lower triangular factor */ 1015 ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr); 1016 loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1017 1018 /* set the matrix descriptors of the lower triangular factor */ 1019 matrixType = cusparseGetMatType(loTriFactor->descr); 1020 indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 1021 fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1022 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1023 diagType = cusparseGetMatDiagType(loTriFactor->descr); 1024 1025 /* Create the matrix description */ 1026 stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat); 1027 stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 1028 stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 1029 stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 1030 stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1031 1032 /* set the operation */ 1033 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1034 1035 /* allocate GPU space for the CSC of the lower triangular factor*/ 1036 loTriFactorT->csrMat = new CsrMatrix; 1037 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 1038 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 1039 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 1040 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1); 1041 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 1042 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 1043 1044 /* compute the transpose of the lower triangular factor, i.e. the CSC */ 1045 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1046 stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1047 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1048 loTriFactor->csrMat->values->data().get(), 1049 loTriFactor->csrMat->row_offsets->data().get(), 1050 loTriFactor->csrMat->column_indices->data().get(), 1051 loTriFactorT->csrMat->values->data().get(), 1052 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1053 CUSPARSE_ACTION_NUMERIC,indexBase, 1054 CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1055 cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1056 #endif 1057 1058 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1059 stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1060 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1061 loTriFactor->csrMat->values->data().get(), 1062 loTriFactor->csrMat->row_offsets->data().get(), 1063 loTriFactor->csrMat->column_indices->data().get(), 1064 loTriFactorT->csrMat->values->data().get(), 1065 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1066 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1067 CUSPARSE_ACTION_NUMERIC, indexBase, 1068 CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer 1069 #else 1070 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1071 CUSPARSE_ACTION_NUMERIC, indexBase 1072 #endif 1073 );CHKERRCUSPARSE(stat); 1074 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1075 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1076 1077 /* Create the solve analysis information */ 1078 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1079 stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1080 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1081 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, 1082 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1083 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1084 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, 1085 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1086 cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1087 #endif 1088 1089 /* perform the solve analysis */ 1090 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 1091 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1092 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1093 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo 1094 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1095 ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1096 #endif 1097 );CHKERRCUSPARSE(stat); 1098 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1099 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1100 1101 /* assign the pointer */ 1102 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 1103 1104 /*********************************************/ 1105 /* Now the Transpose of the Upper Tri Factor */ 1106 /*********************************************/ 1107 1108 /* allocate space for the transpose of the upper triangular factor */ 1109 ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr); 1110 upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1111 1112 /* set the matrix descriptors of the upper triangular factor */ 1113 matrixType = cusparseGetMatType(upTriFactor->descr); 1114 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 1115 fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1116 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1117 diagType = cusparseGetMatDiagType(upTriFactor->descr); 1118 1119 /* Create the matrix description */ 1120 stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat); 1121 stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 1122 stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 1123 stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 1124 stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1125 1126 /* set the operation */ 1127 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1128 1129 /* allocate GPU space for the CSC of the upper triangular factor*/ 1130 upTriFactorT->csrMat = new CsrMatrix; 1131 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1132 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1133 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1134 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1); 1135 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1136 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1137 1138 /* compute the transpose of the upper triangular factor, i.e. the CSC */ 1139 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1140 stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows, 1141 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1142 upTriFactor->csrMat->values->data().get(), 1143 upTriFactor->csrMat->row_offsets->data().get(), 1144 upTriFactor->csrMat->column_indices->data().get(), 1145 upTriFactorT->csrMat->values->data().get(), 1146 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1147 CUSPARSE_ACTION_NUMERIC,indexBase, 1148 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1149 cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1150 #endif 1151 1152 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1153 stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 1154 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1155 upTriFactor->csrMat->values->data().get(), 1156 upTriFactor->csrMat->row_offsets->data().get(), 1157 upTriFactor->csrMat->column_indices->data().get(), 1158 upTriFactorT->csrMat->values->data().get(), 1159 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1160 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1161 CUSPARSE_ACTION_NUMERIC, indexBase, 1162 CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer 1163 #else 1164 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1165 CUSPARSE_ACTION_NUMERIC, indexBase 1166 #endif 1167 );CHKERRCUSPARSE(stat); 1168 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1169 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1170 1171 /* Create the solve analysis information */ 1172 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1173 stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1174 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1175 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, 1176 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1177 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1178 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, 1179 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1180 cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1181 #endif 1182 1183 /* perform the solve analysis */ 1184 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 1185 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1186 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1187 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo 1188 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1189 ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1190 #endif 1191 );CHKERRCUSPARSE(stat); 1192 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1193 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1194 1195 /* assign the pointer */ 1196 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 1197 PetscFunctionReturn(0); 1198 } 1199 1200 static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A) 1201 { 1202 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1203 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1204 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1205 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1206 cusparseStatus_t stat; 1207 cusparseIndexBase_t indexBase; 1208 cudaError_t err; 1209 PetscErrorCode ierr; 1210 1211 PetscFunctionBegin; 1212 if (!cusparsestruct->transgen || cusparsestruct->matTranspose || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0); 1213 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1214 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1215 /* create cusparse matrix */ 1216 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 1217 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat); 1218 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1219 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat); 1220 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1221 1222 /* set alpha and beta */ 1223 err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 1224 err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1225 err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1226 err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1227 err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1228 err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1229 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 1230 1231 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1232 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1233 CsrMatrix *matrixT= new CsrMatrix; 1234 matrixT->num_rows = A->cmap->n; 1235 matrixT->num_cols = A->rmap->n; 1236 matrixT->num_entries = a->nz; 1237 matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 1238 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1239 matrixT->values = new THRUSTARRAY(a->nz); 1240 1241 if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); } 1242 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 1243 1244 /* compute the transpose, i.e. the CSC */ 1245 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1246 stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, 1247 A->cmap->n, matrix->num_entries, 1248 matrix->values->data().get(), 1249 cusparsestruct->rowoffsets_gpu->data().get(), 1250 matrix->column_indices->data().get(), 1251 matrixT->values->data().get(), 1252 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1253 CUSPARSE_ACTION_NUMERIC,indexBase, 1254 cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1255 err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err); 1256 #endif 1257 1258 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1259 A->cmap->n, matrix->num_entries, 1260 matrix->values->data().get(), 1261 cusparsestruct->rowoffsets_gpu->data().get(), 1262 matrix->column_indices->data().get(), 1263 matrixT->values->data().get(), 1264 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1265 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1266 CUSPARSE_ACTION_NUMERIC,indexBase, 1267 cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer 1268 #else 1269 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1270 CUSPARSE_ACTION_NUMERIC, indexBase 1271 #endif 1272 );CHKERRCUSPARSE(stat); 1273 matstructT->mat = matrixT; 1274 1275 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1276 stat = cusparseCreateCsr(&matstructT->matDescr, 1277 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1278 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1279 matrixT->values->data().get(), 1280 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1281 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 1282 #endif 1283 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1284 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1285 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1286 #else 1287 CsrMatrix *temp = new CsrMatrix; 1288 CsrMatrix *tempT = new CsrMatrix; 1289 /* First convert HYB to CSR */ 1290 temp->num_rows = A->rmap->n; 1291 temp->num_cols = A->cmap->n; 1292 temp->num_entries = a->nz; 1293 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1294 temp->column_indices = new THRUSTINTARRAY32(a->nz); 1295 temp->values = new THRUSTARRAY(a->nz); 1296 1297 stat = cusparse_hyb2csr(cusparsestruct->handle, 1298 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 1299 temp->values->data().get(), 1300 temp->row_offsets->data().get(), 1301 temp->column_indices->data().get());CHKERRCUSPARSE(stat); 1302 1303 /* Next, convert CSR to CSC (i.e. the matrix transpose) */ 1304 tempT->num_rows = A->rmap->n; 1305 tempT->num_cols = A->cmap->n; 1306 tempT->num_entries = a->nz; 1307 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1308 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1309 tempT->values = new THRUSTARRAY(a->nz); 1310 1311 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1312 temp->num_cols, temp->num_entries, 1313 temp->values->data().get(), 1314 temp->row_offsets->data().get(), 1315 temp->column_indices->data().get(), 1316 tempT->values->data().get(), 1317 tempT->column_indices->data().get(), 1318 tempT->row_offsets->data().get(), 1319 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1320 1321 /* Last, convert CSC to HYB */ 1322 cusparseHybMat_t hybMat; 1323 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1324 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1325 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1326 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1327 matstructT->descr, tempT->values->data().get(), 1328 tempT->row_offsets->data().get(), 1329 tempT->column_indices->data().get(), 1330 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1331 1332 /* assign the pointer */ 1333 matstructT->mat = hybMat; 1334 /* delete temporaries */ 1335 if (tempT) { 1336 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1337 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1338 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1339 delete (CsrMatrix*) tempT; 1340 } 1341 if (temp) { 1342 if (temp->values) delete (THRUSTARRAY*) temp->values; 1343 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1344 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1345 delete (CsrMatrix*) temp; 1346 } 1347 #endif 1348 } 1349 err = WaitForCUDA();CHKERRCUDA(err); 1350 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1351 ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1352 /* the compressed row indices is not used for matTranspose */ 1353 matstructT->cprowIndices = NULL; 1354 /* assign the pointer */ 1355 ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 1356 PetscFunctionReturn(0); 1357 } 1358 1359 /* Why do we need to analyze the tranposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 1360 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1361 { 1362 PetscInt n = xx->map->n; 1363 const PetscScalar *barray; 1364 PetscScalar *xarray; 1365 thrust::device_ptr<const PetscScalar> bGPU; 1366 thrust::device_ptr<PetscScalar> xGPU; 1367 cusparseStatus_t stat; 1368 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1369 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1370 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1371 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1372 PetscErrorCode ierr; 1373 cudaError_t cerr; 1374 1375 PetscFunctionBegin; 1376 /* Analyze the matrix and create the transpose ... on the fly */ 1377 if (!loTriFactorT && !upTriFactorT) { 1378 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1379 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1380 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1381 } 1382 1383 /* Get the GPU pointers */ 1384 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1385 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1386 xGPU = thrust::device_pointer_cast(xarray); 1387 bGPU = thrust::device_pointer_cast(barray); 1388 1389 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1390 /* First, reorder with the row permutation */ 1391 thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1392 thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 1393 xGPU); 1394 1395 /* First, solve U */ 1396 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1397 upTriFactorT->csrMat->num_rows, 1398 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1399 upTriFactorT->csrMat->num_entries, 1400 #endif 1401 &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1402 upTriFactorT->csrMat->values->data().get(), 1403 upTriFactorT->csrMat->row_offsets->data().get(), 1404 upTriFactorT->csrMat->column_indices->data().get(), 1405 upTriFactorT->solveInfo, 1406 xarray, tempGPU->data().get() 1407 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1408 ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1409 #endif 1410 );CHKERRCUSPARSE(stat); 1411 1412 /* Then, solve L */ 1413 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1414 loTriFactorT->csrMat->num_rows, 1415 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1416 loTriFactorT->csrMat->num_entries, 1417 #endif 1418 &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1419 loTriFactorT->csrMat->values->data().get(), 1420 loTriFactorT->csrMat->row_offsets->data().get(), 1421 loTriFactorT->csrMat->column_indices->data().get(), 1422 loTriFactorT->solveInfo, 1423 tempGPU->data().get(), xarray 1424 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1425 ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1426 #endif 1427 );CHKERRCUSPARSE(stat); 1428 1429 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1430 thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1431 thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1432 tempGPU->begin()); 1433 1434 /* Copy the temporary to the full solution. */ 1435 thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU); 1436 1437 /* restore */ 1438 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1439 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1440 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1441 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1442 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1443 PetscFunctionReturn(0); 1444 } 1445 1446 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1447 { 1448 const PetscScalar *barray; 1449 PetscScalar *xarray; 1450 cusparseStatus_t stat; 1451 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1452 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1453 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1454 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1455 PetscErrorCode ierr; 1456 cudaError_t cerr; 1457 1458 PetscFunctionBegin; 1459 /* Analyze the matrix and create the transpose ... on the fly */ 1460 if (!loTriFactorT && !upTriFactorT) { 1461 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1462 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1463 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1464 } 1465 1466 /* Get the GPU pointers */ 1467 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1468 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1469 1470 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1471 /* First, solve U */ 1472 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1473 upTriFactorT->csrMat->num_rows, 1474 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1475 upTriFactorT->csrMat->num_entries, 1476 #endif 1477 &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1478 upTriFactorT->csrMat->values->data().get(), 1479 upTriFactorT->csrMat->row_offsets->data().get(), 1480 upTriFactorT->csrMat->column_indices->data().get(), 1481 upTriFactorT->solveInfo, 1482 barray, tempGPU->data().get() 1483 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1484 ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1485 #endif 1486 );CHKERRCUSPARSE(stat); 1487 1488 /* Then, solve L */ 1489 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1490 loTriFactorT->csrMat->num_rows, 1491 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1492 loTriFactorT->csrMat->num_entries, 1493 #endif 1494 &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1495 loTriFactorT->csrMat->values->data().get(), 1496 loTriFactorT->csrMat->row_offsets->data().get(), 1497 loTriFactorT->csrMat->column_indices->data().get(), 1498 loTriFactorT->solveInfo, 1499 tempGPU->data().get(), xarray 1500 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1501 ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1502 #endif 1503 );CHKERRCUSPARSE(stat); 1504 1505 /* restore */ 1506 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1507 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1508 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1509 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1510 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1511 PetscFunctionReturn(0); 1512 } 1513 1514 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1515 { 1516 const PetscScalar *barray; 1517 PetscScalar *xarray; 1518 thrust::device_ptr<const PetscScalar> bGPU; 1519 thrust::device_ptr<PetscScalar> xGPU; 1520 cusparseStatus_t stat; 1521 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1522 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1523 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1524 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1525 PetscErrorCode ierr; 1526 cudaError_t cerr; 1527 1528 PetscFunctionBegin; 1529 1530 /* Get the GPU pointers */ 1531 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1532 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1533 xGPU = thrust::device_pointer_cast(xarray); 1534 bGPU = thrust::device_pointer_cast(barray); 1535 1536 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1537 /* First, reorder with the row permutation */ 1538 thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1539 thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 1540 tempGPU->begin()); 1541 1542 /* Next, solve L */ 1543 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1544 loTriFactor->csrMat->num_rows, 1545 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1546 loTriFactor->csrMat->num_entries, 1547 #endif 1548 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1549 loTriFactor->csrMat->values->data().get(), 1550 loTriFactor->csrMat->row_offsets->data().get(), 1551 loTriFactor->csrMat->column_indices->data().get(), 1552 loTriFactor->solveInfo, 1553 tempGPU->data().get(), xarray 1554 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1555 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 1556 #endif 1557 );CHKERRCUSPARSE(stat); 1558 1559 /* Then, solve U */ 1560 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1561 upTriFactor->csrMat->num_rows, 1562 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1563 upTriFactor->csrMat->num_entries, 1564 #endif 1565 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1566 upTriFactor->csrMat->values->data().get(), 1567 upTriFactor->csrMat->row_offsets->data().get(), 1568 upTriFactor->csrMat->column_indices->data().get(), 1569 upTriFactor->solveInfo, 1570 xarray, tempGPU->data().get() 1571 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1572 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 1573 #endif 1574 );CHKERRCUSPARSE(stat); 1575 1576 /* Last, reorder with the column permutation */ 1577 thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 1578 thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), 1579 xGPU); 1580 1581 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1582 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1583 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1584 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1585 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1586 PetscFunctionReturn(0); 1587 } 1588 1589 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1590 { 1591 const PetscScalar *barray; 1592 PetscScalar *xarray; 1593 cusparseStatus_t stat; 1594 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1595 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1596 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1597 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1598 PetscErrorCode ierr; 1599 cudaError_t cerr; 1600 1601 PetscFunctionBegin; 1602 /* Get the GPU pointers */ 1603 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1604 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1605 1606 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1607 /* First, solve L */ 1608 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1609 loTriFactor->csrMat->num_rows, 1610 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1611 loTriFactor->csrMat->num_entries, 1612 #endif 1613 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1614 loTriFactor->csrMat->values->data().get(), 1615 loTriFactor->csrMat->row_offsets->data().get(), 1616 loTriFactor->csrMat->column_indices->data().get(), 1617 loTriFactor->solveInfo, 1618 barray, tempGPU->data().get() 1619 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1620 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 1621 #endif 1622 );CHKERRCUSPARSE(stat); 1623 1624 /* Next, solve U */ 1625 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1626 upTriFactor->csrMat->num_rows, 1627 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1628 upTriFactor->csrMat->num_entries, 1629 #endif 1630 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1631 upTriFactor->csrMat->values->data().get(), 1632 upTriFactor->csrMat->row_offsets->data().get(), 1633 upTriFactor->csrMat->column_indices->data().get(), 1634 upTriFactor->solveInfo, 1635 tempGPU->data().get(), xarray 1636 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1637 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 1638 #endif 1639 );CHKERRCUSPARSE(stat); 1640 1641 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1642 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1643 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1644 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1645 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1646 PetscFunctionReturn(0); 1647 } 1648 1649 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) 1650 { 1651 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1652 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1653 cudaError_t cerr; 1654 PetscErrorCode ierr; 1655 1656 PetscFunctionBegin; 1657 if (A->offloadmask == PETSC_OFFLOAD_GPU) { 1658 CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat; 1659 1660 ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 1661 cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 1662 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1663 ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr); 1664 ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 1665 A->offloadmask = PETSC_OFFLOAD_BOTH; 1666 } 1667 PetscFunctionReturn(0); 1668 } 1669 1670 static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 1671 { 1672 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1673 PetscErrorCode ierr; 1674 1675 PetscFunctionBegin; 1676 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 1677 *array = a->a; 1678 A->offloadmask = PETSC_OFFLOAD_CPU; 1679 PetscFunctionReturn(0); 1680 } 1681 1682 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 1683 { 1684 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1685 Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 1686 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1687 PetscInt m = A->rmap->n,*ii,*ridx,tmp; 1688 PetscErrorCode ierr; 1689 cusparseStatus_t stat; 1690 PetscBool both = PETSC_TRUE; 1691 cudaError_t err; 1692 1693 PetscFunctionBegin; 1694 if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Cannot copy to GPU"); 1695 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 1696 if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { 1697 /* Copy values only */ 1698 CsrMatrix *matrix,*matrixT; 1699 matrix = (CsrMatrix*)cusparsestruct->mat->mat; 1700 1701 if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR values"); 1702 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1703 matrix->values->assign(a->a, a->a+a->nz); 1704 err = WaitForCUDA();CHKERRCUDA(err); 1705 ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 1706 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1707 1708 /* Update matT when it was built before */ 1709 if (cusparsestruct->matTranspose) { 1710 cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr); 1711 matrixT = (CsrMatrix*)cusparsestruct->matTranspose->mat; 1712 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1713 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1714 A->cmap->n, matrix->num_entries, 1715 matrix->values->data().get(), 1716 cusparsestruct->rowoffsets_gpu->data().get(), 1717 matrix->column_indices->data().get(), 1718 matrixT->values->data().get(), 1719 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1720 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1721 CUSPARSE_ACTION_NUMERIC,indexBase, 1722 cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer 1723 #else 1724 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1725 CUSPARSE_ACTION_NUMERIC, indexBase 1726 #endif 1727 );CHKERRCUSPARSE(stat); 1728 err = WaitForCUDA();CHKERRCUDA(err); 1729 ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1730 } 1731 } else { 1732 PetscInt nnz; 1733 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1734 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr); 1735 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr); 1736 delete cusparsestruct->workVector; 1737 delete cusparsestruct->rowoffsets_gpu; 1738 try { 1739 if (a->compressedrow.use) { 1740 m = a->compressedrow.nrows; 1741 ii = a->compressedrow.i; 1742 ridx = a->compressedrow.rindex; 1743 } else { 1744 m = A->rmap->n; 1745 ii = a->i; 1746 ridx = NULL; 1747 } 1748 if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR row data"); 1749 if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR column data"); 1750 if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; } 1751 else nnz = a->nz; 1752 1753 /* create cusparse matrix */ 1754 cusparsestruct->nrows = m; 1755 matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 1756 stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat); 1757 stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 1758 stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1759 1760 err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 1761 err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1762 err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1763 err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1764 err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1765 err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1766 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 1767 1768 /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1769 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1770 /* set the matrix */ 1771 CsrMatrix *mat= new CsrMatrix; 1772 mat->num_rows = m; 1773 mat->num_cols = A->cmap->n; 1774 mat->num_entries = nnz; 1775 mat->row_offsets = new THRUSTINTARRAY32(m+1); 1776 mat->row_offsets->assign(ii, ii + m+1); 1777 1778 mat->column_indices = new THRUSTINTARRAY32(nnz); 1779 mat->column_indices->assign(a->j, a->j+nnz); 1780 1781 mat->values = new THRUSTARRAY(nnz); 1782 if (a->a) mat->values->assign(a->a, a->a+nnz); 1783 1784 /* assign the pointer */ 1785 matstruct->mat = mat; 1786 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1787 if (mat->num_rows) { /* cusparse errors on empty matrices! */ 1788 stat = cusparseCreateCsr(&matstruct->matDescr, 1789 mat->num_rows, mat->num_cols, mat->num_entries, 1790 mat->row_offsets->data().get(), mat->column_indices->data().get(), 1791 mat->values->data().get(), 1792 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 1793 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 1794 } 1795 #endif 1796 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1797 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1798 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1799 #else 1800 CsrMatrix *mat= new CsrMatrix; 1801 mat->num_rows = m; 1802 mat->num_cols = A->cmap->n; 1803 mat->num_entries = nnz; 1804 mat->row_offsets = new THRUSTINTARRAY32(m+1); 1805 mat->row_offsets->assign(ii, ii + m+1); 1806 1807 mat->column_indices = new THRUSTINTARRAY32(nnz); 1808 mat->column_indices->assign(a->j, a->j+nnz); 1809 1810 mat->values = new THRUSTARRAY(nnz); 1811 if (a->a) mat->values->assign(a->a, a->a+nnz); 1812 1813 cusparseHybMat_t hybMat; 1814 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1815 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1816 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1817 stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, 1818 matstruct->descr, mat->values->data().get(), 1819 mat->row_offsets->data().get(), 1820 mat->column_indices->data().get(), 1821 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1822 /* assign the pointer */ 1823 matstruct->mat = hybMat; 1824 1825 if (mat) { 1826 if (mat->values) delete (THRUSTARRAY*)mat->values; 1827 if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices; 1828 if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets; 1829 delete (CsrMatrix*)mat; 1830 } 1831 #endif 1832 } 1833 1834 /* assign the compressed row indices */ 1835 if (a->compressedrow.use) { 1836 cusparsestruct->workVector = new THRUSTARRAY(m); 1837 matstruct->cprowIndices = new THRUSTINTARRAY(m); 1838 matstruct->cprowIndices->assign(ridx,ridx+m); 1839 tmp = m; 1840 } else { 1841 cusparsestruct->workVector = NULL; 1842 matstruct->cprowIndices = NULL; 1843 tmp = 0; 1844 } 1845 ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr); 1846 1847 /* assign the pointer */ 1848 cusparsestruct->mat = matstruct; 1849 } catch(char *ex) { 1850 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1851 } 1852 err = WaitForCUDA();CHKERRCUDA(err); 1853 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1854 cusparsestruct->nonzerostate = A->nonzerostate; 1855 } 1856 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 1857 } 1858 PetscFunctionReturn(0); 1859 } 1860 1861 struct VecCUDAPlusEquals 1862 { 1863 template <typename Tuple> 1864 __host__ __device__ 1865 void operator()(Tuple t) 1866 { 1867 thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 1868 } 1869 }; 1870 1871 struct VecCUDAEquals 1872 { 1873 template <typename Tuple> 1874 __host__ __device__ 1875 void operator()(Tuple t) 1876 { 1877 thrust::get<1>(t) = thrust::get<0>(t); 1878 } 1879 }; 1880 1881 struct VecCUDAEqualsReverse 1882 { 1883 template <typename Tuple> 1884 __host__ __device__ 1885 void operator()(Tuple t) 1886 { 1887 thrust::get<0>(t) = thrust::get<1>(t); 1888 } 1889 }; 1890 1891 struct MatMatCusparse { 1892 PetscBool cisdense; 1893 PetscScalar *Bt; 1894 Mat X; 1895 PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */ 1896 PetscLogDouble flops; 1897 CsrMatrix *Bcsr; 1898 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1899 cusparseSpMatDescr_t matSpBDescr; 1900 PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 1901 cusparseDnMatDescr_t matBDescr; 1902 cusparseDnMatDescr_t matCDescr; 1903 PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/ 1904 size_t mmBufferSize; 1905 void *mmBuffer; 1906 void *mmBuffer2; /* SpGEMM WorkEstimation buffer */ 1907 cusparseSpGEMMDescr_t spgemmDesc; 1908 #endif 1909 }; 1910 1911 static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 1912 { 1913 PetscErrorCode ierr; 1914 MatMatCusparse *mmdata = (MatMatCusparse *)data; 1915 cudaError_t cerr; 1916 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1917 cusparseStatus_t stat; 1918 #endif 1919 1920 PetscFunctionBegin; 1921 cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr); 1922 delete mmdata->Bcsr; 1923 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1924 if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); } 1925 if (mmdata->mmBuffer) { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); } 1926 if (mmdata->mmBuffer2) { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); } 1927 if (mmdata->matBDescr) { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); } 1928 if (mmdata->matCDescr) { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); } 1929 if (mmdata->spgemmDesc) { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); } 1930 #endif 1931 ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr); 1932 ierr = PetscFree(data);CHKERRQ(ierr); 1933 PetscFunctionReturn(0); 1934 } 1935 1936 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool); 1937 1938 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 1939 { 1940 Mat_Product *product = C->product; 1941 Mat A,B; 1942 PetscInt m,n,blda,clda; 1943 PetscBool flg,biscuda; 1944 Mat_SeqAIJCUSPARSE *cusp; 1945 cusparseStatus_t stat; 1946 cusparseOperation_t opA; 1947 const PetscScalar *barray; 1948 PetscScalar *carray; 1949 PetscErrorCode ierr; 1950 MatMatCusparse *mmdata; 1951 Mat_SeqAIJCUSPARSEMultStruct *mat; 1952 CsrMatrix *csrmat; 1953 cudaError_t cerr; 1954 1955 PetscFunctionBegin; 1956 MatCheckProduct(C,1); 1957 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 1958 mmdata = (MatMatCusparse*)product->data; 1959 A = product->A; 1960 B = product->B; 1961 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 1962 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 1963 /* currently CopyToGpu does not copy if the matrix is bound to CPU 1964 Instead of silently accepting the wrong answer, I prefer to raise the error */ 1965 if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 1966 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1967 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1968 switch (product->type) { 1969 case MATPRODUCT_AB: 1970 case MATPRODUCT_PtAP: 1971 mat = cusp->mat; 1972 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1973 m = A->rmap->n; 1974 n = B->cmap->n; 1975 break; 1976 case MATPRODUCT_AtB: 1977 if (!cusp->transgen) { 1978 mat = cusp->mat; 1979 opA = CUSPARSE_OPERATION_TRANSPOSE; 1980 } else { 1981 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1982 mat = cusp->matTranspose; 1983 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1984 } 1985 m = A->cmap->n; 1986 n = B->cmap->n; 1987 break; 1988 case MATPRODUCT_ABt: 1989 case MATPRODUCT_RARt: 1990 mat = cusp->mat; 1991 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1992 m = A->rmap->n; 1993 n = B->rmap->n; 1994 break; 1995 default: 1996 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 1997 } 1998 if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 1999 csrmat = (CsrMatrix*)mat->mat; 2000 /* if the user passed a CPU matrix, copy the data to the GPU */ 2001 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr); 2002 if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);} 2003 ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr); 2004 2005 ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr); 2006 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2007 ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2008 ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr); 2009 } else { 2010 ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr); 2011 ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr); 2012 } 2013 2014 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2015 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2016 cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 2017 /* (re)allcoate mmBuffer if not initialized or LDAs are different */ 2018 if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 2019 size_t mmBufferSize; 2020 if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;} 2021 if (!mmdata->matBDescr) { 2022 stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2023 mmdata->Blda = blda; 2024 } 2025 2026 if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;} 2027 if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 2028 stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2029 mmdata->Clda = clda; 2030 } 2031 2032 if (!mat->matDescr) { 2033 stat = cusparseCreateCsr(&mat->matDescr, 2034 csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, 2035 csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), 2036 csrmat->values->data().get(), 2037 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2038 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 2039 } 2040 stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one, 2041 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2042 mmdata->matCDescr,cusparse_scalartype, 2043 cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat); 2044 if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) { 2045 cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); 2046 cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr); 2047 mmdata->mmBufferSize = mmBufferSize; 2048 } 2049 mmdata->initialized = PETSC_TRUE; 2050 } else { 2051 /* to be safe, always update pointers of the mats */ 2052 stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat); 2053 stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat); 2054 stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat); 2055 } 2056 2057 /* do cusparseSpMM, which supports transpose on B */ 2058 stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one, 2059 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2060 mmdata->matCDescr,cusparse_scalartype, 2061 cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2062 #else 2063 PetscInt k; 2064 /* cusparseXcsrmm does not support transpose on B */ 2065 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2066 cublasHandle_t cublasv2handle; 2067 cublasStatus_t cerr; 2068 2069 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 2070 cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T, 2071 B->cmap->n,B->rmap->n, 2072 &PETSC_CUSPARSE_ONE ,barray,blda, 2073 &PETSC_CUSPARSE_ZERO,barray,blda, 2074 mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr); 2075 blda = B->cmap->n; 2076 k = B->cmap->n; 2077 } else { 2078 k = B->rmap->n; 2079 } 2080 2081 /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 2082 stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k, 2083 csrmat->num_entries,mat->alpha_one,mat->descr, 2084 csrmat->values->data().get(), 2085 csrmat->row_offsets->data().get(), 2086 csrmat->column_indices->data().get(), 2087 mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero, 2088 carray,clda);CHKERRCUSPARSE(stat); 2089 #endif 2090 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2091 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2092 ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr); 2093 ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr); 2094 if (product->type == MATPRODUCT_RARt) { 2095 ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2096 ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2097 } else if (product->type == MATPRODUCT_PtAP) { 2098 ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2099 ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2100 } else { 2101 ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr); 2102 } 2103 if (mmdata->cisdense) { 2104 ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr); 2105 } 2106 if (!biscuda) { 2107 ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 2108 } 2109 PetscFunctionReturn(0); 2110 } 2111 2112 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2113 { 2114 Mat_Product *product = C->product; 2115 Mat A,B; 2116 PetscInt m,n; 2117 PetscBool cisdense,flg; 2118 PetscErrorCode ierr; 2119 MatMatCusparse *mmdata; 2120 Mat_SeqAIJCUSPARSE *cusp; 2121 2122 PetscFunctionBegin; 2123 MatCheckProduct(C,1); 2124 if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty"); 2125 A = product->A; 2126 B = product->B; 2127 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2128 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 2129 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2130 if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2131 switch (product->type) { 2132 case MATPRODUCT_AB: 2133 m = A->rmap->n; 2134 n = B->cmap->n; 2135 break; 2136 case MATPRODUCT_AtB: 2137 m = A->cmap->n; 2138 n = B->cmap->n; 2139 break; 2140 case MATPRODUCT_ABt: 2141 m = A->rmap->n; 2142 n = B->rmap->n; 2143 break; 2144 case MATPRODUCT_PtAP: 2145 m = B->cmap->n; 2146 n = B->cmap->n; 2147 break; 2148 case MATPRODUCT_RARt: 2149 m = B->rmap->n; 2150 n = B->rmap->n; 2151 break; 2152 default: 2153 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 2154 } 2155 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2156 /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */ 2157 ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr); 2158 ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr); 2159 2160 /* product data */ 2161 ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2162 mmdata->cisdense = cisdense; 2163 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0) 2164 /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */ 2165 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2166 cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr); 2167 } 2168 #endif 2169 /* for these products we need intermediate storage */ 2170 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2171 ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr); 2172 ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr); 2173 if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */ 2174 ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr); 2175 } else { 2176 ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr); 2177 } 2178 } 2179 C->product->data = mmdata; 2180 C->product->destroy = MatDestroy_MatMatCusparse; 2181 2182 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 2183 PetscFunctionReturn(0); 2184 } 2185 2186 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2187 { 2188 Mat_Product *product = C->product; 2189 Mat A,B; 2190 Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2191 Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 2192 Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2193 CsrMatrix *Acsr,*Bcsr,*Ccsr; 2194 PetscBool flg; 2195 PetscErrorCode ierr; 2196 cusparseStatus_t stat; 2197 cudaError_t cerr; 2198 MatProductType ptype; 2199 MatMatCusparse *mmdata; 2200 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2201 cusparseSpMatDescr_t BmatSpDescr; 2202 #endif 2203 2204 PetscFunctionBegin; 2205 MatCheckProduct(C,1); 2206 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 2207 ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2208 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for C of type %s",((PetscObject)C)->type_name); 2209 mmdata = (MatMatCusparse*)C->product->data; 2210 A = product->A; 2211 B = product->B; 2212 if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */ 2213 mmdata->reusesym = PETSC_FALSE; 2214 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2215 if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2216 Cmat = Ccusp->mat; 2217 if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]); 2218 Ccsr = (CsrMatrix*)Cmat->mat; 2219 if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct"); 2220 goto finalize; 2221 } 2222 if (!c->nz) goto finalize; 2223 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2224 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 2225 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2226 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name); 2227 if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2228 if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2229 Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2230 Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2231 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2232 if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2233 if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2234 if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2235 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2236 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 2237 2238 ptype = product->type; 2239 if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 2240 if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 2241 switch (ptype) { 2242 case MATPRODUCT_AB: 2243 Amat = Acusp->mat; 2244 Bmat = Bcusp->mat; 2245 break; 2246 case MATPRODUCT_AtB: 2247 Amat = Acusp->matTranspose; 2248 Bmat = Bcusp->mat; 2249 break; 2250 case MATPRODUCT_ABt: 2251 Amat = Acusp->mat; 2252 Bmat = Bcusp->matTranspose; 2253 break; 2254 default: 2255 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 2256 } 2257 Cmat = Ccusp->mat; 2258 if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2259 if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2260 if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[ptype]); 2261 Acsr = (CsrMatrix*)Amat->mat; 2262 Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */ 2263 Ccsr = (CsrMatrix*)Cmat->mat; 2264 if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct"); 2265 if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct"); 2266 if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct"); 2267 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2268 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2269 BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */ 2270 stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2271 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2272 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2273 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2274 stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2275 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2276 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2277 #else 2278 stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2279 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2280 Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2281 Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2282 Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2283 #endif 2284 ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2285 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2286 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2287 C->offloadmask = PETSC_OFFLOAD_GPU; 2288 finalize: 2289 /* shorter version of MatAssemblyEnd_SeqAIJ */ 2290 ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 2291 ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 2292 ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr); 2293 c->reallocs = 0; 2294 C->info.mallocs += 0; 2295 C->info.nz_unneeded = 0; 2296 C->assembled = C->was_assembled = PETSC_TRUE; 2297 C->num_ass++; 2298 PetscFunctionReturn(0); 2299 } 2300 2301 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2302 { 2303 Mat_Product *product = C->product; 2304 Mat A,B; 2305 Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2306 Mat_SeqAIJ *a,*b,*c; 2307 Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2308 CsrMatrix *Acsr,*Bcsr,*Ccsr; 2309 PetscInt i,j,m,n,k; 2310 PetscBool flg; 2311 PetscErrorCode ierr; 2312 cusparseStatus_t stat; 2313 cudaError_t cerr; 2314 MatProductType ptype; 2315 MatMatCusparse *mmdata; 2316 PetscLogDouble flops; 2317 PetscBool biscompressed,ciscompressed; 2318 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2319 int64_t C_num_rows1, C_num_cols1, C_nnz1; 2320 size_t bufSize2; 2321 cusparseSpMatDescr_t BmatSpDescr; 2322 #else 2323 int cnz; 2324 #endif 2325 2326 PetscFunctionBegin; 2327 MatCheckProduct(C,1); 2328 if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty"); 2329 A = product->A; 2330 B = product->B; 2331 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2332 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 2333 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2334 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name); 2335 a = (Mat_SeqAIJ*)A->data; 2336 b = (Mat_SeqAIJ*)B->data; 2337 Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2338 Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2339 if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2340 if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2341 2342 /* product data */ 2343 ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2344 C->product->data = mmdata; 2345 C->product->destroy = MatDestroy_MatMatCusparse; 2346 2347 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2348 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 2349 ptype = product->type; 2350 if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 2351 if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 2352 biscompressed = PETSC_FALSE; 2353 ciscompressed = PETSC_FALSE; 2354 switch (ptype) { 2355 case MATPRODUCT_AB: 2356 m = A->rmap->n; 2357 n = B->cmap->n; 2358 k = A->cmap->n; 2359 Amat = Acusp->mat; 2360 Bmat = Bcusp->mat; 2361 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2362 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2363 break; 2364 case MATPRODUCT_AtB: 2365 m = A->cmap->n; 2366 n = B->cmap->n; 2367 k = A->rmap->n; 2368 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 2369 Amat = Acusp->matTranspose; 2370 Bmat = Bcusp->mat; 2371 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2372 break; 2373 case MATPRODUCT_ABt: 2374 m = A->rmap->n; 2375 n = B->rmap->n; 2376 k = A->cmap->n; 2377 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr); 2378 Amat = Acusp->mat; 2379 Bmat = Bcusp->matTranspose; 2380 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2381 break; 2382 default: 2383 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 2384 } 2385 2386 /* create cusparse matrix */ 2387 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2388 ierr = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 2389 c = (Mat_SeqAIJ*)C->data; 2390 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2391 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 2392 Ccsr = new CsrMatrix; 2393 2394 c->compressedrow.use = ciscompressed; 2395 if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */ 2396 c->compressedrow.nrows = a->compressedrow.nrows; 2397 ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr); 2398 ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr); 2399 Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 2400 Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 2401 Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows); 2402 } else { 2403 c->compressedrow.nrows = 0; 2404 c->compressedrow.i = NULL; 2405 c->compressedrow.rindex = NULL; 2406 Ccusp->workVector = NULL; 2407 Cmat->cprowIndices = NULL; 2408 } 2409 Ccusp->nrows = ciscompressed ? c->compressedrow.nrows : m; 2410 Ccusp->mat = Cmat; 2411 Ccusp->mat->mat = Ccsr; 2412 Ccsr->num_rows = Ccusp->nrows; 2413 Ccsr->num_cols = n; 2414 Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1); 2415 stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat); 2416 stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 2417 stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 2418 cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 2419 cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 2420 cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 2421 cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2422 cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2423 cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2424 if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */ 2425 thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0); 2426 c->nz = 0; 2427 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2428 Ccsr->values = new THRUSTARRAY(c->nz); 2429 goto finalizesym; 2430 } 2431 2432 if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2433 if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2434 Acsr = (CsrMatrix*)Amat->mat; 2435 if (!biscompressed) { 2436 Bcsr = (CsrMatrix*)Bmat->mat; 2437 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2438 BmatSpDescr = Bmat->matDescr; 2439 #endif 2440 } else { /* we need to use row offsets for the full matrix */ 2441 CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat; 2442 Bcsr = new CsrMatrix; 2443 Bcsr->num_rows = B->rmap->n; 2444 Bcsr->num_cols = cBcsr->num_cols; 2445 Bcsr->num_entries = cBcsr->num_entries; 2446 Bcsr->column_indices = cBcsr->column_indices; 2447 Bcsr->values = cBcsr->values; 2448 if (!Bcusp->rowoffsets_gpu) { 2449 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 2450 Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 2451 ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 2452 } 2453 Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 2454 mmdata->Bcsr = Bcsr; 2455 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2456 if (Bcsr->num_rows && Bcsr->num_cols) { 2457 stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, 2458 Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2459 Bcsr->values->data().get(), 2460 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2461 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2462 } 2463 BmatSpDescr = mmdata->matSpBDescr; 2464 #endif 2465 } 2466 if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct"); 2467 if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct"); 2468 /* precompute flops count */ 2469 if (ptype == MATPRODUCT_AB) { 2470 for (i=0, flops = 0; i<A->rmap->n; i++) { 2471 const PetscInt st = a->i[i]; 2472 const PetscInt en = a->i[i+1]; 2473 for (j=st; j<en; j++) { 2474 const PetscInt brow = a->j[j]; 2475 flops += 2.*(b->i[brow+1] - b->i[brow]); 2476 } 2477 } 2478 } else if (ptype == MATPRODUCT_AtB) { 2479 for (i=0, flops = 0; i<A->rmap->n; i++) { 2480 const PetscInt anzi = a->i[i+1] - a->i[i]; 2481 const PetscInt bnzi = b->i[i+1] - b->i[i]; 2482 flops += (2.*anzi)*bnzi; 2483 } 2484 } else { /* TODO */ 2485 flops = 0.; 2486 } 2487 2488 mmdata->flops = flops; 2489 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2490 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2491 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2492 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, 2493 NULL, NULL, NULL, 2494 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2495 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2496 stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2497 /* ask bufferSize bytes for external memory */ 2498 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2499 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2500 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2501 mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat); 2502 cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr); 2503 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2504 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2505 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2506 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2507 mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat); 2508 /* ask bufferSize again bytes for external memory */ 2509 stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2510 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2511 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2512 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat); 2513 /* The CUSPARSE documentation is not clear, nor the API 2514 We need both buffers to perform the operations properly! 2515 mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 2516 it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 2517 is stored in the descriptor! What a messy API... */ 2518 cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr); 2519 /* compute the intermediate product of A * B */ 2520 stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2521 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2522 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2523 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2524 /* get matrix C non-zero entries C_nnz1 */ 2525 stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat); 2526 c->nz = (PetscInt) C_nnz1; 2527 ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr); 2528 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2529 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2530 Ccsr->values = new THRUSTARRAY(c->nz); 2531 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2532 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2533 Ccsr->values->data().get());CHKERRCUSPARSE(stat); 2534 stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2535 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2536 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2537 #else 2538 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 2539 stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2540 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2541 Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2542 Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2543 Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat); 2544 c->nz = cnz; 2545 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2546 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2547 Ccsr->values = new THRUSTARRAY(c->nz); 2548 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2549 2550 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2551 /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 2552 I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 2553 D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */ 2554 stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2555 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2556 Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2557 Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2558 Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2559 #endif 2560 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2561 ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2562 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2563 finalizesym: 2564 c->singlemalloc = PETSC_FALSE; 2565 c->free_a = PETSC_TRUE; 2566 c->free_ij = PETSC_TRUE; 2567 ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 2568 ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 2569 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 2570 PetscInt *d_i = c->i; 2571 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 2572 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 2573 ii = *Ccsr->row_offsets; 2574 jj = *Ccsr->column_indices; 2575 if (ciscompressed) d_i = c->compressedrow.i; 2576 cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2577 cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2578 } else { 2579 PetscInt *d_i = c->i; 2580 if (ciscompressed) d_i = c->compressedrow.i; 2581 cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2582 cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2583 } 2584 if (ciscompressed) { /* need to expand host row offsets */ 2585 PetscInt r = 0; 2586 c->i[0] = 0; 2587 for (k = 0; k < c->compressedrow.nrows; k++) { 2588 const PetscInt next = c->compressedrow.rindex[k]; 2589 const PetscInt old = c->compressedrow.i[k]; 2590 for (; r < next; r++) c->i[r+1] = old; 2591 } 2592 for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows]; 2593 } 2594 ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 2595 ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 2596 ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 2597 c->maxnz = c->nz; 2598 c->nonzerorowcnt = 0; 2599 c->rmax = 0; 2600 for (k = 0; k < m; k++) { 2601 const PetscInt nn = c->i[k+1] - c->i[k]; 2602 c->ilen[k] = c->imax[k] = nn; 2603 c->nonzerorowcnt += (PetscInt)!!nn; 2604 c->rmax = PetscMax(c->rmax,nn); 2605 } 2606 ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr); 2607 ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 2608 Ccsr->num_entries = c->nz; 2609 2610 C->nonzerostate++; 2611 ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr); 2612 ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr); 2613 Ccusp->nonzerostate = C->nonzerostate; 2614 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 2615 C->preallocated = PETSC_TRUE; 2616 C->assembled = PETSC_FALSE; 2617 C->was_assembled = PETSC_FALSE; 2618 if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */ 2619 mmdata->reusesym = PETSC_TRUE; 2620 C->offloadmask = PETSC_OFFLOAD_GPU; 2621 } 2622 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2623 PetscFunctionReturn(0); 2624 } 2625 2626 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 2627 2628 /* handles sparse or dense B */ 2629 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat) 2630 { 2631 Mat_Product *product = mat->product; 2632 PetscErrorCode ierr; 2633 PetscBool isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE; 2634 2635 PetscFunctionBegin; 2636 MatCheckProduct(mat,1); 2637 ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr); 2638 if (!product->A->boundtocpu && !product->B->boundtocpu) { 2639 ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr); 2640 } 2641 if (product->type == MATPRODUCT_ABC) { 2642 Ciscusp = PETSC_FALSE; 2643 if (!product->C->boundtocpu) { 2644 ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr); 2645 } 2646 } 2647 if (isdense) { 2648 switch (product->type) { 2649 case MATPRODUCT_AB: 2650 case MATPRODUCT_AtB: 2651 case MATPRODUCT_ABt: 2652 case MATPRODUCT_PtAP: 2653 case MATPRODUCT_RARt: 2654 if (product->A->boundtocpu) { 2655 ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr); 2656 } else { 2657 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 2658 } 2659 break; 2660 case MATPRODUCT_ABC: 2661 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2662 break; 2663 default: 2664 break; 2665 } 2666 } else if (Biscusp && Ciscusp) { 2667 switch (product->type) { 2668 case MATPRODUCT_AB: 2669 case MATPRODUCT_AtB: 2670 case MATPRODUCT_ABt: 2671 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2672 break; 2673 case MATPRODUCT_PtAP: 2674 case MATPRODUCT_RARt: 2675 case MATPRODUCT_ABC: 2676 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2677 break; 2678 default: 2679 break; 2680 } 2681 } else { /* fallback for AIJ */ 2682 ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 2683 } 2684 PetscFunctionReturn(0); 2685 } 2686 2687 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2688 { 2689 PetscErrorCode ierr; 2690 2691 PetscFunctionBegin; 2692 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2693 PetscFunctionReturn(0); 2694 } 2695 2696 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz) 2697 { 2698 PetscErrorCode ierr; 2699 2700 PetscFunctionBegin; 2701 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2702 PetscFunctionReturn(0); 2703 } 2704 2705 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2706 { 2707 PetscErrorCode ierr; 2708 2709 PetscFunctionBegin; 2710 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 2711 PetscFunctionReturn(0); 2712 } 2713 2714 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2715 { 2716 PetscErrorCode ierr; 2717 2718 PetscFunctionBegin; 2719 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 2720 PetscFunctionReturn(0); 2721 } 2722 2723 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2724 { 2725 PetscErrorCode ierr; 2726 2727 PetscFunctionBegin; 2728 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2729 PetscFunctionReturn(0); 2730 } 2731 2732 /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 2733 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm) 2734 { 2735 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2736 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 2737 Mat_SeqAIJCUSPARSEMultStruct *matstruct; 2738 PetscScalar *xarray,*zarray,*dptr,*beta,*xptr; 2739 PetscErrorCode ierr; 2740 cudaError_t cerr; 2741 cusparseStatus_t stat; 2742 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2743 PetscBool compressed; 2744 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2745 PetscInt nx,ny; 2746 #endif 2747 2748 PetscFunctionBegin; 2749 if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported"); 2750 if (!a->nonzerorowcnt) { 2751 if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);} 2752 else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);} 2753 PetscFunctionReturn(0); 2754 } 2755 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 2756 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2757 if (!trans) { 2758 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 2759 if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 2760 } else { 2761 if (herm || !cusparsestruct->transgen) { 2762 opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 2763 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 2764 } else { 2765 if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);} 2766 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 2767 } 2768 } 2769 /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 2770 compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE; 2771 2772 try { 2773 ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 2774 if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */ 2775 else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */ 2776 2777 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2778 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 2779 /* z = A x + beta y. 2780 If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 2781 When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 2782 */ 2783 xptr = xarray; 2784 dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 2785 beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 2786 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2787 /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 2788 allocated to accommodate different uses. So we get the length info directly from mat. 2789 */ 2790 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2791 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2792 nx = mat->num_cols; 2793 ny = mat->num_rows; 2794 } 2795 #endif 2796 } else { 2797 /* z = A^T x + beta y 2798 If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 2799 Note A^Tx is of full length, so we set beta to 1.0 if y exists. 2800 */ 2801 xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 2802 dptr = zarray; 2803 beta = yy ? matstruct->beta_one : matstruct->beta_zero; 2804 if (compressed) { /* Scatter x to work vector */ 2805 thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 2806 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 2807 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 2808 VecCUDAEqualsReverse()); 2809 } 2810 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2811 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2812 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2813 nx = mat->num_rows; 2814 ny = mat->num_cols; 2815 } 2816 #endif 2817 } 2818 2819 /* csr_spmv does y = alpha op(A) x + beta y */ 2820 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2821 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2822 if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 2823 if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 2824 stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 2825 stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 2826 stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, 2827 matstruct->matDescr, 2828 matstruct->cuSpMV[opA].vecXDescr, beta, 2829 matstruct->cuSpMV[opA].vecYDescr, 2830 cusparse_scalartype, 2831 cusparsestruct->spmvAlg, 2832 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat); 2833 cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr); 2834 2835 matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 2836 } else { 2837 /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 2838 stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat); 2839 stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat); 2840 } 2841 2842 stat = cusparseSpMV(cusparsestruct->handle, opA, 2843 matstruct->alpha_one, 2844 matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */ 2845 matstruct->cuSpMV[opA].vecXDescr, 2846 beta, 2847 matstruct->cuSpMV[opA].vecYDescr, 2848 cusparse_scalartype, 2849 cusparsestruct->spmvAlg, 2850 matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat); 2851 #else 2852 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2853 stat = cusparse_csr_spmv(cusparsestruct->handle, opA, 2854 mat->num_rows, mat->num_cols, 2855 mat->num_entries, matstruct->alpha_one, matstruct->descr, 2856 mat->values->data().get(), mat->row_offsets->data().get(), 2857 mat->column_indices->data().get(), xptr, beta, 2858 dptr);CHKERRCUSPARSE(stat); 2859 #endif 2860 } else { 2861 if (cusparsestruct->nrows) { 2862 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2863 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 2864 #else 2865 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 2866 stat = cusparse_hyb_spmv(cusparsestruct->handle, opA, 2867 matstruct->alpha_one, matstruct->descr, hybMat, 2868 xptr, beta, 2869 dptr);CHKERRCUSPARSE(stat); 2870 #endif 2871 } 2872 } 2873 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2874 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2875 2876 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 2877 if (yy) { /* MatMultAdd: zz = A*xx + yy */ 2878 if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 2879 ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */ 2880 } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */ 2881 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 2882 } 2883 } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 2884 ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr); 2885 } 2886 2887 /* ScatterAdd the result from work vector into the full vector when A is compressed */ 2888 if (compressed) { 2889 thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 2890 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2891 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 2892 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 2893 VecCUDAPlusEquals()); 2894 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2895 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2896 } 2897 } else { 2898 if (yy && yy != zz) { 2899 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 2900 } 2901 } 2902 ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 2903 if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);} 2904 else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);} 2905 } catch(char *ex) { 2906 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 2907 } 2908 if (yy) { 2909 ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); 2910 } else { 2911 ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr); 2912 } 2913 PetscFunctionReturn(0); 2914 } 2915 2916 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2917 { 2918 PetscErrorCode ierr; 2919 2920 PetscFunctionBegin; 2921 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2922 PetscFunctionReturn(0); 2923 } 2924 2925 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 2926 { 2927 PetscErrorCode ierr; 2928 PetscSplitCSRDataStructure *d_mat = NULL; 2929 PetscFunctionBegin; 2930 if (A->factortype == MAT_FACTOR_NONE) { 2931 d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat; 2932 } 2933 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it? 2934 if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0); 2935 if (d_mat) { 2936 A->offloadmask = PETSC_OFFLOAD_GPU; 2937 } 2938 2939 PetscFunctionReturn(0); 2940 } 2941 2942 /* --------------------------------------------------------------------------------*/ 2943 /*@ 2944 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 2945 (the default parallel PETSc format). This matrix will ultimately pushed down 2946 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 2947 assembly performance the user should preallocate the matrix storage by setting 2948 the parameter nz (or the array nnz). By setting these parameters accurately, 2949 performance during matrix assembly can be increased by more than a factor of 50. 2950 2951 Collective 2952 2953 Input Parameters: 2954 + comm - MPI communicator, set to PETSC_COMM_SELF 2955 . m - number of rows 2956 . n - number of columns 2957 . nz - number of nonzeros per row (same for all rows) 2958 - nnz - array containing the number of nonzeros in the various rows 2959 (possibly different for each row) or NULL 2960 2961 Output Parameter: 2962 . A - the matrix 2963 2964 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 2965 MatXXXXSetPreallocation() paradgm instead of this routine directly. 2966 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 2967 2968 Notes: 2969 If nnz is given then nz is ignored 2970 2971 The AIJ format (also called the Yale sparse matrix format or 2972 compressed row storage), is fully compatible with standard Fortran 77 2973 storage. That is, the stored row and column indices can begin at 2974 either one (as in Fortran) or zero. See the users' manual for details. 2975 2976 Specify the preallocated storage with either nz or nnz (not both). 2977 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 2978 allocation. For large problems you MUST preallocate memory or you 2979 will get TERRIBLE performance, see the users' manual chapter on matrices. 2980 2981 By default, this format uses inodes (identical nodes) when possible, to 2982 improve numerical efficiency of matrix-vector products and solves. We 2983 search for consecutive rows with the same nonzero structure, thereby 2984 reusing matrix information to achieve increased efficiency. 2985 2986 Level: intermediate 2987 2988 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 2989 @*/ 2990 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 2991 { 2992 PetscErrorCode ierr; 2993 2994 PetscFunctionBegin; 2995 ierr = MatCreate(comm,A);CHKERRQ(ierr); 2996 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 2997 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 2998 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 2999 PetscFunctionReturn(0); 3000 } 3001 3002 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 3003 { 3004 PetscErrorCode ierr; 3005 PetscSplitCSRDataStructure *d_mat = NULL; 3006 3007 PetscFunctionBegin; 3008 if (A->factortype == MAT_FACTOR_NONE) { 3009 d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat; 3010 ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL; 3011 ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr); 3012 } else { 3013 ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr); 3014 } 3015 if (d_mat) { 3016 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3017 cudaError_t err; 3018 PetscSplitCSRDataStructure h_mat; 3019 ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr); 3020 err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err); 3021 if (a->compressedrow.use) { 3022 err = cudaFree(h_mat.diag.i);CHKERRCUDA(err); 3023 } 3024 err = cudaFree(d_mat);CHKERRCUDA(err); 3025 } 3026 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr); 3027 ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr); 3028 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 3029 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 3030 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr); 3031 ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 3032 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 3033 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 3034 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 3035 PetscFunctionReturn(0); 3036 } 3037 3038 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*); 3039 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool); 3040 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B) 3041 { 3042 PetscErrorCode ierr; 3043 3044 PetscFunctionBegin; 3045 ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 3046 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr); 3047 PetscFunctionReturn(0); 3048 } 3049 3050 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str) 3051 { 3052 PetscErrorCode ierr; 3053 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 3054 Mat_SeqAIJCUSPARSE *cy; 3055 Mat_SeqAIJCUSPARSE *cx; 3056 PetscScalar *ay; 3057 const PetscScalar *ax; 3058 CsrMatrix *csry,*csrx; 3059 cudaError_t cerr; 3060 3061 PetscFunctionBegin; 3062 if (X->ops->axpy != Y->ops->axpy) { 3063 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 3064 PetscFunctionReturn(0); 3065 } 3066 /* if we are here, it means both matrices are bound to GPU */ 3067 ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr); 3068 ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr); 3069 cy = (Mat_SeqAIJCUSPARSE*)Y->spptr; 3070 cx = (Mat_SeqAIJCUSPARSE*)X->spptr; 3071 if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported"); 3072 if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported"); 3073 csry = (CsrMatrix*)cy->mat->mat; 3074 csrx = (CsrMatrix*)cx->mat->mat; 3075 /* see if we can turn this into a cublas axpy */ 3076 if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) { 3077 bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin()); 3078 if (eq) { 3079 eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin()); 3080 } 3081 if (eq) str = SAME_NONZERO_PATTERN; 3082 } 3083 /* spgeam is buggy with one column */ 3084 if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN; 3085 3086 if (str == SUBSET_NONZERO_PATTERN) { 3087 cusparseStatus_t stat; 3088 PetscScalar b = 1.0; 3089 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3090 size_t bufferSize; 3091 void *buffer; 3092 #endif 3093 3094 ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr); 3095 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3096 stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 3097 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3098 stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n, 3099 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3100 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3101 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat); 3102 cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr); 3103 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3104 stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3105 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3106 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3107 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat); 3108 cerr = WaitForCUDA();CHKERRCUDA(cerr); 3109 ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr); 3110 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3111 cerr = cudaFree(buffer);CHKERRCUDA(cerr); 3112 #else 3113 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3114 stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3115 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3116 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3117 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat); 3118 cerr = WaitForCUDA();CHKERRCUDA(cerr); 3119 ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr); 3120 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3121 #endif 3122 stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 3123 ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr); 3124 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3125 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3126 } else if (str == SAME_NONZERO_PATTERN) { 3127 cublasHandle_t cublasv2handle; 3128 cublasStatus_t berr; 3129 PetscBLASInt one = 1, bnz = 1; 3130 3131 ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr); 3132 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3133 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 3134 ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr); 3135 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3136 berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr); 3137 cerr = WaitForCUDA();CHKERRCUDA(cerr); 3138 ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr); 3139 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3140 ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr); 3141 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3142 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3143 } else { 3144 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 3145 } 3146 PetscFunctionReturn(0); 3147 } 3148 3149 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a) 3150 { 3151 PetscErrorCode ierr; 3152 Mat_SeqAIJ *y = (Mat_SeqAIJ*)Y->data; 3153 PetscScalar *ay; 3154 cudaError_t cerr; 3155 cublasHandle_t cublasv2handle; 3156 cublasStatus_t berr; 3157 PetscBLASInt one = 1, bnz = 1; 3158 3159 PetscFunctionBegin; 3160 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3161 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 3162 ierr = PetscBLASIntCast(y->nz,&bnz);CHKERRQ(ierr); 3163 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3164 berr = cublasXscal(cublasv2handle,bnz,&a,ay,one);CHKERRCUBLAS(berr); 3165 cerr = WaitForCUDA();CHKERRCUDA(cerr); 3166 ierr = PetscLogGpuFlops(bnz);CHKERRQ(ierr); 3167 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3168 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3169 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3170 PetscFunctionReturn(0); 3171 } 3172 3173 static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) 3174 { 3175 PetscErrorCode ierr; 3176 PetscBool both = PETSC_FALSE; 3177 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3178 3179 PetscFunctionBegin; 3180 if (A->factortype == MAT_FACTOR_NONE) { 3181 Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr; 3182 if (spptr->mat) { 3183 CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat; 3184 if (matrix->values) { 3185 both = PETSC_TRUE; 3186 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3187 } 3188 } 3189 if (spptr->matTranspose) { 3190 CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat; 3191 if (matrix->values) { 3192 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3193 } 3194 } 3195 } 3196 //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 3197 ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr); 3198 ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 3199 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 3200 else A->offloadmask = PETSC_OFFLOAD_CPU; 3201 3202 PetscFunctionReturn(0); 3203 } 3204 3205 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg) 3206 { 3207 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3208 PetscErrorCode ierr; 3209 3210 PetscFunctionBegin; 3211 if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0); 3212 if (flg) { 3213 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 3214 3215 A->ops->scale = MatScale_SeqAIJ; 3216 A->ops->axpy = MatAXPY_SeqAIJ; 3217 A->ops->zeroentries = MatZeroEntries_SeqAIJ; 3218 A->ops->mult = MatMult_SeqAIJ; 3219 A->ops->multadd = MatMultAdd_SeqAIJ; 3220 A->ops->multtranspose = MatMultTranspose_SeqAIJ; 3221 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 3222 A->ops->multhermitiantranspose = NULL; 3223 A->ops->multhermitiantransposeadd = NULL; 3224 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ; 3225 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr); 3226 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 3227 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 3228 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 3229 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 3230 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr); 3231 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr); 3232 } else { 3233 A->ops->scale = MatScale_SeqAIJCUSPARSE; 3234 A->ops->axpy = MatAXPY_SeqAIJCUSPARSE; 3235 A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE; 3236 A->ops->mult = MatMult_SeqAIJCUSPARSE; 3237 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 3238 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 3239 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 3240 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 3241 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 3242 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE; 3243 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr); 3244 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3245 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3246 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3247 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3248 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr); 3249 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3250 } 3251 A->boundtocpu = flg; 3252 a->inode.use = flg; 3253 PetscFunctionReturn(0); 3254 } 3255 3256 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 3257 { 3258 PetscErrorCode ierr; 3259 cusparseStatus_t stat; 3260 Mat B; 3261 3262 PetscFunctionBegin; 3263 ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */ 3264 if (reuse == MAT_INITIAL_MATRIX) { 3265 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 3266 } else if (reuse == MAT_REUSE_MATRIX) { 3267 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 3268 } 3269 B = *newmat; 3270 3271 ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 3272 ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr); 3273 3274 if (reuse != MAT_REUSE_MATRIX && !B->spptr) { 3275 if (B->factortype == MAT_FACTOR_NONE) { 3276 Mat_SeqAIJCUSPARSE *spptr; 3277 3278 ierr = PetscNew(&spptr);CHKERRQ(ierr); 3279 spptr->format = MAT_CUSPARSE_CSR; 3280 stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3281 B->spptr = spptr; 3282 spptr->deviceMat = NULL; 3283 } else { 3284 Mat_SeqAIJCUSPARSETriFactors *spptr; 3285 3286 ierr = PetscNew(&spptr);CHKERRQ(ierr); 3287 stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3288 B->spptr = spptr; 3289 } 3290 B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 3291 } 3292 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 3293 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 3294 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 3295 B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 3296 B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 3297 3298 ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr); 3299 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3300 ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 3301 PetscFunctionReturn(0); 3302 } 3303 3304 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 3305 { 3306 PetscErrorCode ierr; 3307 3308 PetscFunctionBegin; 3309 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 3310 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 3311 ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr); 3312 ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr); 3313 ierr = PetscOptionsEnd();CHKERRQ(ierr); 3314 PetscFunctionReturn(0); 3315 } 3316 3317 /*MC 3318 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 3319 3320 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 3321 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 3322 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 3323 3324 Options Database Keys: 3325 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 3326 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3327 - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3328 3329 Level: beginner 3330 3331 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 3332 M*/ 3333 3334 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*); 3335 3336 3337 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 3338 { 3339 PetscErrorCode ierr; 3340 3341 PetscFunctionBegin; 3342 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3343 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3344 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3345 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3346 PetscFunctionReturn(0); 3347 } 3348 3349 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 3350 { 3351 PetscErrorCode ierr; 3352 cusparseStatus_t stat; 3353 3354 PetscFunctionBegin; 3355 if (*cusparsestruct) { 3356 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr); 3357 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr); 3358 delete (*cusparsestruct)->workVector; 3359 delete (*cusparsestruct)->rowoffsets_gpu; 3360 delete (*cusparsestruct)->cooPerm; 3361 delete (*cusparsestruct)->cooPerm_a; 3362 if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);} 3363 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3364 cudaError_t cerr = cudaFree((*cusparsestruct)->csr2cscBuffer);CHKERRCUDA(cerr); 3365 #endif 3366 ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr); 3367 } 3368 PetscFunctionReturn(0); 3369 } 3370 3371 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 3372 { 3373 PetscFunctionBegin; 3374 if (*mat) { 3375 delete (*mat)->values; 3376 delete (*mat)->column_indices; 3377 delete (*mat)->row_offsets; 3378 delete *mat; 3379 *mat = 0; 3380 } 3381 PetscFunctionReturn(0); 3382 } 3383 3384 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 3385 { 3386 cusparseStatus_t stat; 3387 PetscErrorCode ierr; 3388 3389 PetscFunctionBegin; 3390 if (*trifactor) { 3391 if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); } 3392 if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); } 3393 ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 3394 if ((*trifactor)->solveBuffer) {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);} 3395 if ((*trifactor)->AA_h) {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);} 3396 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3397 if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);} 3398 #endif 3399 ierr = PetscFree(*trifactor);CHKERRQ(ierr); 3400 } 3401 PetscFunctionReturn(0); 3402 } 3403 3404 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 3405 { 3406 CsrMatrix *mat; 3407 cusparseStatus_t stat; 3408 cudaError_t err; 3409 3410 PetscFunctionBegin; 3411 if (*matstruct) { 3412 if ((*matstruct)->mat) { 3413 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 3414 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3415 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3416 #else 3417 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 3418 stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat); 3419 #endif 3420 } else { 3421 mat = (CsrMatrix*)(*matstruct)->mat; 3422 CsrMatrix_Destroy(&mat); 3423 } 3424 } 3425 if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); } 3426 delete (*matstruct)->cprowIndices; 3427 if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); } 3428 if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); } 3429 if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); } 3430 3431 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3432 Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 3433 if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);} 3434 for (int i=0; i<3; i++) { 3435 if (mdata->cuSpMV[i].initialized) { 3436 err = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err); 3437 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat); 3438 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat); 3439 } 3440 } 3441 #endif 3442 delete *matstruct; 3443 *matstruct = NULL; 3444 } 3445 PetscFunctionReturn(0); 3446 } 3447 3448 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors) 3449 { 3450 PetscErrorCode ierr; 3451 3452 PetscFunctionBegin; 3453 if (*trifactors) { 3454 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr); 3455 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr); 3456 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr); 3457 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr); 3458 delete (*trifactors)->rpermIndices; 3459 delete (*trifactors)->cpermIndices; 3460 delete (*trifactors)->workVector; 3461 (*trifactors)->rpermIndices = NULL; 3462 (*trifactors)->cpermIndices = NULL; 3463 (*trifactors)->workVector = NULL; 3464 } 3465 PetscFunctionReturn(0); 3466 } 3467 3468 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 3469 { 3470 PetscErrorCode ierr; 3471 cusparseHandle_t handle; 3472 cusparseStatus_t stat; 3473 3474 PetscFunctionBegin; 3475 if (*trifactors) { 3476 ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr); 3477 if (handle = (*trifactors)->handle) { 3478 stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 3479 } 3480 ierr = PetscFree(*trifactors);CHKERRQ(ierr); 3481 } 3482 PetscFunctionReturn(0); 3483 } 3484 3485 struct IJCompare 3486 { 3487 __host__ __device__ 3488 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3489 { 3490 if (t1.get<0>() < t2.get<0>()) return true; 3491 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 3492 return false; 3493 } 3494 }; 3495 3496 struct IJEqual 3497 { 3498 __host__ __device__ 3499 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3500 { 3501 if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false; 3502 return true; 3503 } 3504 }; 3505 3506 struct IJDiff 3507 { 3508 __host__ __device__ 3509 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 3510 { 3511 return t1 == t2 ? 0 : 1; 3512 } 3513 }; 3514 3515 struct IJSum 3516 { 3517 __host__ __device__ 3518 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 3519 { 3520 return t1||t2; 3521 } 3522 }; 3523 3524 #include <thrust/iterator/discard_iterator.h> 3525 PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 3526 { 3527 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3528 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3529 THRUSTARRAY *cooPerm_v = NULL; 3530 thrust::device_ptr<const PetscScalar> d_v; 3531 CsrMatrix *matrix; 3532 PetscErrorCode ierr; 3533 cudaError_t cerr; 3534 PetscInt n; 3535 3536 PetscFunctionBegin; 3537 if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct"); 3538 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix"); 3539 if (!cusp->cooPerm) { 3540 ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3541 ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3542 PetscFunctionReturn(0); 3543 } 3544 matrix = (CsrMatrix*)cusp->mat->mat; 3545 if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3546 if (!v) { 3547 if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3548 goto finalize; 3549 } 3550 n = cusp->cooPerm->size(); 3551 if (isCudaMem(v)) { 3552 d_v = thrust::device_pointer_cast(v); 3553 } else { 3554 cooPerm_v = new THRUSTARRAY(n); 3555 cooPerm_v->assign(v,v+n); 3556 d_v = cooPerm_v->data(); 3557 ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); 3558 } 3559 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3560 if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */ 3561 if (cusp->cooPerm_a) { 3562 THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size()); 3563 auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3564 thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3565 thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>()); 3566 delete cooPerm_w; 3567 } else { 3568 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 3569 matrix->values->begin())); 3570 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 3571 matrix->values->end())); 3572 thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); 3573 } 3574 } else { 3575 if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */ 3576 auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3577 thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3578 } else { 3579 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 3580 matrix->values->begin())); 3581 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 3582 matrix->values->end())); 3583 thrust::for_each(zibit,zieit,VecCUDAEquals()); 3584 } 3585 } 3586 cerr = WaitForCUDA();CHKERRCUDA(cerr); 3587 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3588 finalize: 3589 delete cooPerm_v; 3590 A->offloadmask = PETSC_OFFLOAD_GPU; 3591 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 3592 /* shorter version of MatAssemblyEnd_SeqAIJ */ 3593 ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr); 3594 ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 3595 ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr); 3596 a->reallocs = 0; 3597 A->info.mallocs += 0; 3598 A->info.nz_unneeded = 0; 3599 A->assembled = A->was_assembled = PETSC_TRUE; 3600 A->num_ass++; 3601 PetscFunctionReturn(0); 3602 } 3603 3604 #include <thrust/binary_search.h> 3605 PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[]) 3606 { 3607 PetscErrorCode ierr; 3608 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3609 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3610 PetscInt cooPerm_n, nzr = 0; 3611 cudaError_t cerr; 3612 3613 PetscFunctionBegin; 3614 ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr); 3615 ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr); 3616 cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0; 3617 if (n != cooPerm_n) { 3618 delete cusp->cooPerm; 3619 delete cusp->cooPerm_a; 3620 cusp->cooPerm = NULL; 3621 cusp->cooPerm_a = NULL; 3622 } 3623 if (n) { 3624 THRUSTINTARRAY d_i(n); 3625 THRUSTINTARRAY d_j(n); 3626 THRUSTINTARRAY ii(A->rmap->n); 3627 3628 if (!cusp->cooPerm) { cusp->cooPerm = new THRUSTINTARRAY(n); } 3629 if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); } 3630 3631 ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 3632 d_i.assign(coo_i,coo_i+n); 3633 d_j.assign(coo_j,coo_j+n); 3634 auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin())); 3635 auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end())); 3636 3637 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3638 thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0); 3639 thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); 3640 *cusp->cooPerm_a = d_i; 3641 THRUSTINTARRAY w = d_j; 3642 3643 auto nekey = thrust::unique(fkey, ekey, IJEqual()); 3644 if (nekey == ekey) { /* all entries are unique */ 3645 delete cusp->cooPerm_a; 3646 cusp->cooPerm_a = NULL; 3647 } else { /* I couldn't come up with a more elegant algorithm */ 3648 adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); 3649 adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff()); 3650 (*cusp->cooPerm_a)[0] = 0; 3651 w[0] = 0; 3652 thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); 3653 thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); 3654 } 3655 thrust::counting_iterator<PetscInt> search_begin(0); 3656 thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), 3657 search_begin, search_begin + A->rmap->n, 3658 ii.begin()); 3659 cerr = WaitForCUDA();CHKERRCUDA(cerr); 3660 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3661 3662 ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr); 3663 a->singlemalloc = PETSC_FALSE; 3664 a->free_a = PETSC_TRUE; 3665 a->free_ij = PETSC_TRUE; 3666 ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr); 3667 a->i[0] = 0; 3668 cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 3669 a->nz = a->maxnz = a->i[A->rmap->n]; 3670 a->rmax = 0; 3671 ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr); 3672 ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr); 3673 cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 3674 if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); } 3675 if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); } 3676 for (PetscInt i = 0; i < A->rmap->n; i++) { 3677 const PetscInt nnzr = a->i[i+1] - a->i[i]; 3678 nzr += (PetscInt)!!(nnzr); 3679 a->ilen[i] = a->imax[i] = nnzr; 3680 a->rmax = PetscMax(a->rmax,nnzr); 3681 } 3682 a->nonzerorowcnt = nzr; 3683 A->preallocated = PETSC_TRUE; 3684 ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr); 3685 ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr); 3686 } else { 3687 ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr); 3688 } 3689 ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr); 3690 3691 /* We want to allocate the CUSPARSE struct for matvec now. 3692 The code is so convoluted now that I prefer to copy zeros */ 3693 ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr); 3694 ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr); 3695 A->offloadmask = PETSC_OFFLOAD_CPU; 3696 A->nonzerostate++; 3697 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 3698 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr); 3699 3700 A->assembled = PETSC_FALSE; 3701 A->was_assembled = PETSC_FALSE; 3702 PetscFunctionReturn(0); 3703 } 3704 3705 PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a) 3706 { 3707 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3708 CsrMatrix *csr; 3709 PetscErrorCode ierr; 3710 3711 PetscFunctionBegin; 3712 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3713 PetscValidPointer(a,2); 3714 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3715 if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3716 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 3717 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3718 csr = (CsrMatrix*)cusp->mat->mat; 3719 if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3720 *a = csr->values->data().get(); 3721 PetscFunctionReturn(0); 3722 } 3723 3724 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a) 3725 { 3726 PetscFunctionBegin; 3727 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3728 PetscValidPointer(a,2); 3729 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3730 *a = NULL; 3731 PetscFunctionReturn(0); 3732 } 3733 3734 PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a) 3735 { 3736 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3737 CsrMatrix *csr; 3738 PetscErrorCode ierr; 3739 3740 PetscFunctionBegin; 3741 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3742 PetscValidPointer(a,2); 3743 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3744 if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3745 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 3746 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3747 csr = (CsrMatrix*)cusp->mat->mat; 3748 if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3749 *a = csr->values->data().get(); 3750 A->offloadmask = PETSC_OFFLOAD_GPU; 3751 PetscFunctionReturn(0); 3752 } 3753 3754 PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a) 3755 { 3756 PetscErrorCode ierr; 3757 3758 PetscFunctionBegin; 3759 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3760 PetscValidPointer(a,2); 3761 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3762 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 3763 *a = NULL; 3764 PetscFunctionReturn(0); 3765 } 3766 3767 PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a) 3768 { 3769 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3770 CsrMatrix *csr; 3771 3772 PetscFunctionBegin; 3773 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3774 PetscValidPointer(a,2); 3775 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3776 if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3777 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3778 csr = (CsrMatrix*)cusp->mat->mat; 3779 if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3780 *a = csr->values->data().get(); 3781 A->offloadmask = PETSC_OFFLOAD_GPU; 3782 PetscFunctionReturn(0); 3783 } 3784 3785 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a) 3786 { 3787 PetscErrorCode ierr; 3788 3789 PetscFunctionBegin; 3790 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3791 PetscValidPointer(a,2); 3792 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3793 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 3794 *a = NULL; 3795 PetscFunctionReturn(0); 3796 } 3797 3798 struct IJCompare4 3799 { 3800 __host__ __device__ 3801 inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) 3802 { 3803 if (t1.get<0>() < t2.get<0>()) return true; 3804 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 3805 return false; 3806 } 3807 }; 3808 3809 struct Shift 3810 { 3811 int _shift; 3812 3813 Shift(int shift) : _shift(shift) {} 3814 __host__ __device__ 3815 inline int operator() (const int &c) 3816 { 3817 return c + _shift; 3818 } 3819 }; 3820 3821 /* merges to SeqAIJCUSPARSE matrices, [A';B']' operation in matlab notation */ 3822 PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 3823 { 3824 PetscErrorCode ierr; 3825 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c; 3826 Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp; 3827 Mat_SeqAIJCUSPARSEMultStruct *Cmat; 3828 CsrMatrix *Acsr,*Bcsr,*Ccsr; 3829 PetscInt Annz,Bnnz; 3830 cusparseStatus_t stat; 3831 PetscInt i,m,n,zero = 0; 3832 cudaError_t cerr; 3833 3834 PetscFunctionBegin; 3835 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3836 PetscValidHeaderSpecific(B,MAT_CLASSID,2); 3837 PetscValidPointer(C,4); 3838 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3839 PetscCheckTypeName(B,MATSEQAIJCUSPARSE); 3840 if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n); 3841 if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 3842 if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3843 if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3844 if (reuse == MAT_INITIAL_MATRIX) { 3845 m = A->rmap->n; 3846 n = A->cmap->n + B->cmap->n; 3847 ierr = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr); 3848 ierr = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr); 3849 ierr = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3850 c = (Mat_SeqAIJ*)(*C)->data; 3851 Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 3852 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 3853 Ccsr = new CsrMatrix; 3854 Cmat->cprowIndices = NULL; 3855 c->compressedrow.use = PETSC_FALSE; 3856 c->compressedrow.nrows = 0; 3857 c->compressedrow.i = NULL; 3858 c->compressedrow.rindex = NULL; 3859 Ccusp->workVector = NULL; 3860 Ccusp->nrows = m; 3861 Ccusp->mat = Cmat; 3862 Ccusp->mat->mat = Ccsr; 3863 Ccsr->num_rows = m; 3864 Ccsr->num_cols = n; 3865 stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat); 3866 stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 3867 stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 3868 cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 3869 cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 3870 cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 3871 cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 3872 cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 3873 cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 3874 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 3875 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 3876 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 3877 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr); 3878 if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3879 if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3880 3881 Acsr = (CsrMatrix*)Acusp->mat->mat; 3882 Bcsr = (CsrMatrix*)Bcusp->mat->mat; 3883 Annz = (PetscInt)Acsr->column_indices->size(); 3884 Bnnz = (PetscInt)Bcsr->column_indices->size(); 3885 c->nz = Annz + Bnnz; 3886 Ccsr->row_offsets = new THRUSTINTARRAY32(m+1); 3887 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3888 Ccsr->values = new THRUSTARRAY(c->nz); 3889 Ccsr->num_entries = c->nz; 3890 Ccusp->cooPerm = new THRUSTINTARRAY(c->nz); 3891 if (c->nz) { 3892 auto Acoo = new THRUSTINTARRAY32(Annz); 3893 auto Bcoo = new THRUSTINTARRAY32(Bnnz); 3894 auto Ccoo = new THRUSTINTARRAY32(c->nz); 3895 THRUSTINTARRAY32 *Aroff,*Broff; 3896 3897 if (a->compressedrow.use) { /* need full row offset */ 3898 if (!Acusp->rowoffsets_gpu) { 3899 Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 3900 Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 3901 ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 3902 } 3903 Aroff = Acusp->rowoffsets_gpu; 3904 } else Aroff = Acsr->row_offsets; 3905 if (b->compressedrow.use) { /* need full row offset */ 3906 if (!Bcusp->rowoffsets_gpu) { 3907 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 3908 Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 3909 ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 3910 } 3911 Broff = Bcusp->rowoffsets_gpu; 3912 } else Broff = Bcsr->row_offsets; 3913 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3914 stat = cusparseXcsr2coo(Acusp->handle, 3915 Aroff->data().get(), 3916 Annz, 3917 m, 3918 Acoo->data().get(), 3919 CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 3920 stat = cusparseXcsr2coo(Bcusp->handle, 3921 Broff->data().get(), 3922 Bnnz, 3923 m, 3924 Bcoo->data().get(), 3925 CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 3926 /* Issues when using bool with large matrices on SUMMIT 10.2.89 */ 3927 auto Aperm = thrust::make_constant_iterator(1); 3928 auto Bperm = thrust::make_constant_iterator(0); 3929 #if PETSC_PKG_CUDA_VERSION_GE(10,0,0) 3930 auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n)); 3931 auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n)); 3932 #else 3933 /* there are issues instantiating the merge operation using a transform iterator for the columns of B */ 3934 auto Bcib = Bcsr->column_indices->begin(); 3935 auto Bcie = Bcsr->column_indices->end(); 3936 thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n)); 3937 #endif 3938 auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz); 3939 auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm)); 3940 auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm)); 3941 auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm)); 3942 auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm)); 3943 auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin())); 3944 auto p1 = Ccusp->cooPerm->begin(); 3945 auto p2 = Ccusp->cooPerm->begin(); 3946 thrust::advance(p2,Annz); 3947 PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4())); 3948 #if PETSC_PKG_CUDA_VERSION_LT(10,0,0) 3949 thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n)); 3950 #endif 3951 auto cci = thrust::make_counting_iterator(zero); 3952 auto cce = thrust::make_counting_iterator(c->nz); 3953 #if 0 //Errors on SUMMIT cuda 11.1.0 3954 PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>())); 3955 #else 3956 auto pred = thrust::identity<int>(); 3957 PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred)); 3958 PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred)); 3959 #endif 3960 stat = cusparseXcoo2csr(Ccusp->handle, 3961 Ccoo->data().get(), 3962 c->nz, 3963 m, 3964 Ccsr->row_offsets->data().get(), 3965 CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 3966 cerr = WaitForCUDA();CHKERRCUDA(cerr); 3967 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3968 delete wPerm; 3969 delete Acoo; 3970 delete Bcoo; 3971 delete Ccoo; 3972 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3973 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, 3974 Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), 3975 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 3976 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 3977 #endif 3978 if (Acusp->transgen && Bcusp->transgen) { /* if A and B have the transpose, generate C transpose too */ 3979 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 3980 Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct; 3981 CsrMatrix *CcsrT = new CsrMatrix; 3982 CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 3983 CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL; 3984 3985 Ccusp->transgen = PETSC_TRUE; 3986 CmatT->cprowIndices = NULL; 3987 CmatT->mat = CcsrT; 3988 CcsrT->num_rows = n; 3989 CcsrT->num_cols = m; 3990 CcsrT->num_entries = c->nz; 3991 3992 CcsrT->row_offsets = new THRUSTINTARRAY32(n+1); 3993 CcsrT->column_indices = new THRUSTINTARRAY32(c->nz); 3994 CcsrT->values = new THRUSTARRAY(c->nz); 3995 3996 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3997 auto rT = CcsrT->row_offsets->begin(); 3998 if (AT) { 3999 rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT); 4000 thrust::advance(rT,-1); 4001 } 4002 if (BT) { 4003 auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz)); 4004 auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz)); 4005 thrust::copy(titb,tite,rT); 4006 } 4007 auto cT = CcsrT->column_indices->begin(); 4008 if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT); 4009 if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT); 4010 auto vT = CcsrT->values->begin(); 4011 if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4012 if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 4013 cerr = WaitForCUDA();CHKERRCUDA(cerr); 4014 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4015 4016 stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat); 4017 stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4018 stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 4019 cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 4020 cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 4021 cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 4022 cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4023 cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4024 cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4025 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4026 stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, 4027 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), 4028 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 4029 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 4030 #endif 4031 Ccusp->matTranspose = CmatT; 4032 } 4033 } 4034 4035 c->singlemalloc = PETSC_FALSE; 4036 c->free_a = PETSC_TRUE; 4037 c->free_ij = PETSC_TRUE; 4038 ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 4039 ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 4040 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 4041 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 4042 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 4043 ii = *Ccsr->row_offsets; 4044 jj = *Ccsr->column_indices; 4045 cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4046 cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4047 } else { 4048 cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4049 cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4050 } 4051 ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 4052 ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 4053 ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 4054 c->maxnz = c->nz; 4055 c->nonzerorowcnt = 0; 4056 c->rmax = 0; 4057 for (i = 0; i < m; i++) { 4058 const PetscInt nn = c->i[i+1] - c->i[i]; 4059 c->ilen[i] = c->imax[i] = nn; 4060 c->nonzerorowcnt += (PetscInt)!!nn; 4061 c->rmax = PetscMax(c->rmax,nn); 4062 } 4063 ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr); 4064 ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 4065 (*C)->nonzerostate++; 4066 ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr); 4067 ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr); 4068 Ccusp->nonzerostate = (*C)->nonzerostate; 4069 (*C)->preallocated = PETSC_TRUE; 4070 } else { 4071 if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n); 4072 c = (Mat_SeqAIJ*)(*C)->data; 4073 if (c->nz) { 4074 Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 4075 if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm"); 4076 if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4077 if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate"); 4078 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4079 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 4080 if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4081 if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4082 Acsr = (CsrMatrix*)Acusp->mat->mat; 4083 Bcsr = (CsrMatrix*)Bcusp->mat->mat; 4084 Ccsr = (CsrMatrix*)Ccusp->mat->mat; 4085 if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size()); 4086 if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size()); 4087 if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size()); 4088 if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries); 4089 if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size()); 4090 auto pmid = Ccusp->cooPerm->begin(); 4091 thrust::advance(pmid,Acsr->num_entries); 4092 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4093 auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), 4094 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin()))); 4095 auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), 4096 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4097 thrust::for_each(zibait,zieait,VecCUDAEquals()); 4098 auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), 4099 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4100 auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), 4101 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end()))); 4102 thrust::for_each(zibbit,ziebit,VecCUDAEquals()); 4103 if (Acusp->transgen && Bcusp->transgen && Ccusp->transgen) { 4104 if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct"); 4105 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4106 CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4107 CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4108 CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat; 4109 auto vT = CcsrT->values->begin(); 4110 if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4111 if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 4112 } 4113 cerr = WaitForCUDA();CHKERRCUDA(cerr); 4114 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4115 } 4116 } 4117 ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr); 4118 (*C)->assembled = PETSC_TRUE; 4119 (*C)->was_assembled = PETSC_FALSE; 4120 (*C)->offloadmask = PETSC_OFFLOAD_GPU; 4121 PetscFunctionReturn(0); 4122 } 4123 4124 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[]) 4125 { 4126 PetscErrorCode ierr; 4127 bool dmem; 4128 const PetscScalar *av; 4129 cudaError_t cerr; 4130 4131 PetscFunctionBegin; 4132 dmem = isCudaMem(v); 4133 ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr); 4134 if (n && idx) { 4135 THRUSTINTARRAY widx(n); 4136 widx.assign(idx,idx+n); 4137 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 4138 4139 THRUSTARRAY *w = NULL; 4140 thrust::device_ptr<PetscScalar> dv; 4141 if (dmem) { 4142 dv = thrust::device_pointer_cast(v); 4143 } else { 4144 w = new THRUSTARRAY(n); 4145 dv = w->data(); 4146 } 4147 thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av); 4148 4149 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv)); 4150 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n)); 4151 thrust::for_each(zibit,zieit,VecCUDAEquals()); 4152 if (w) { 4153 cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4154 } 4155 delete w; 4156 } else { 4157 cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4158 } 4159 if (!dmem) { ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); } 4160 ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr); 4161 PetscFunctionReturn(0); 4162 } 4163