/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>

/* String table consumed by PetscOptionsEnum(): value strings first, then the enum type name and option prefix */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)        = 1,
      CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)        = 2,
      CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)        = 3,
      CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)        = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
  } cusparseCsr2CscAlg_t;
*/
/* NOTE: the entries below are listed in 0-based cuSPARSE enum-value order (see comment above);
   the consistency of that ordering is checked at option-parsing time in MatSetFromOptions_SeqAIJCUSPARSE() */
const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif

/* Forward declarations of the SeqAIJCUSPARSE implementations defined later in this file */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static
PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);

PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);

/* Record the given CUDA stream in the matrix' GPU data and attach it to the
   matrix' cuSPARSE handle, so subsequent cuSPARSE calls run on that stream. */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cstat;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusp->stream = stream;
  cstat = cusparseSetStream(cusp->handle,cusp->stream);CHKERRCUSPARSE(cstat);
  PetscFunctionReturn(0);
}

/* Make the matrix use the given cuSPARSE library handle.  Any different handle
   the matrix currently holds is destroyed first; the adopted handle's pointer
   mode is then forced to CUSPARSE_POINTER_MODE_DEVICE. */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cstat;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusp->handle != handle) {
    if (cusp->handle) { /* release the handle we currently hold before adopting the new one */
      cstat = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(cstat);
    }
    cusp->handle = handle;
  }
  cstat = cusparseSetPointerMode(cusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(cstat);
  PetscFunctionReturn(0);
}

/* Forget the cuSPARSE handle stored in the matrix; a no-op for matrices that
   are not MATSEQAIJCUSPARSE or have no GPU data.  NOTE(review): the handle is
   only zeroed, not destroyed, so it is presumably owned by the caller. */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          isseqaijcusp;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&isseqaijcusp);CHKERRQ(ierr);
  if (!isseqaijcusp || !cusp) PetscFunctionReturn(0);
  if (cusp->handle) cusp->handle = 0;
  PetscFunctionReturn(0);
}

/* Query callback composed on cusparse factor matrices: reports MATSOLVERCUSPARSE. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.
  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/* MATSOLVERCUSPARSE factory: creates the (square, local-size n) factor matrix B
   for A and installs the symbolic-factorization callbacks matching the requested
   factor type; the symbolic routines later install the numeric callbacks. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  (*B)->useordering = PETSC_TRUE;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* MAT_SKIP_ALLOCATION: no nonzero storage is allocated at this point */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type-specific implementation behind MatCUSPARSESetFormat().  The sequential
   class keeps a single storage format, so MAT_CUSPARSE_MULT and
   MAT_CUSPARSE_ALL currently set the same field. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.
   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatches to MatCUSPARSESetFormat_SeqAIJCUSPARSE() when composed; silently no-op otherwise */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
   MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose

   Collective on mat

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
-  transgen - the boolean flag

   Level: intermediate

.seealso: MATSEQAIJCUSPARSE, MatAIJCUSPARSESetGenerateTranspose()
@*/
PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
{
  PetscErrorCode ierr;
  PetscBool      flg;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  ierr =
PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (flg) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
    cusp->transgen = transgen;
    if (!transgen) { /* need to destroy the transpose matrix if present to prevent from logic errors if transgen is set to true later */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}

/* Processes the -mat_cusparse_* runtime options for a MATSEQAIJCUSPARSE matrix.
   Only non-factored matrices consult the options; factored matrices keep their
   defaults.  The CUDA >= 11 section also picks the cuSPARSE SpMV/SpMM/csr2csc
   algorithms and cross-checks PETSc's hardwired enum values against cusparse.h. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscBool transgen = cusparsestruct->transgen;

    ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Drops any stale GPU triangular-factor data, performs the host symbolic ILU
   factorization, then installs the CUSPARSE numeric factorization routine. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Same pattern as above, for full LU. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Same pattern, for ICC: host symbolic factorization then CUSPARSE numeric hook. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Same pattern, for Cholesky. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr =
MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Copies the lower-triangular part of the host ILU factor (the Mat_SeqAIJ data
   of the factored matrix A) into a unit-diagonal CSR matrix on the GPU and runs
   the cuSPARSE triangular-solve analysis for it.  Host staging buffers use
   pinned memory (cudaMallocHost).  On later calls, when the structure already
   exists, only the numerical values are refilled and re-uploaded. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* ai[n]-ai[1] = stored entries of rows 1..n-1, plus n unit-diagonal terms
         -- assumes the factored-AIJ layout where each row stores its strictly
         lower entries first (TODO confirm against MatILUFactorSymbolic_SeqAIJ) */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;

        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix; row 0 holds only the unit diagonal */
        AiLo[0] = (PetscInt) 0;
        AiLo[n] = nzLower;
        AjLo[0] = (PetscInt) 0;
        AALo[0] = (MatScalar) 1.0;
        v  = aa;
        vi = aj;
        offset    = 1;
        rowOffset = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          /* append the unit diagonal entry after the strictly-lower entries */
          offset += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* newer API requires an explicit work buffer for the analysis/solve */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                                 );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        /* keep the pinned value buffer for cheap refills on later numeric factorizations */
        loTriFactor->AA_h = AALo;
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v  = aa;
        vi = aj;
        offset = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Counterpart of the lower-triangular build above for the upper factor U
   (non-unit diagonal): rows are assembled back-to-front using a->diag. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next,
figure out the number of nonzeros in the upper triangular matrix. */
      /* a->diag runs downward here; adiag[0]-adiag[n] spans all stored U entries
         (factored-AIJ layout -- TODO confirm against MatILUFactorSymbolic_SeqAIJ) */
      nzUpper = adiag[0]-adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix, walking the rows from last to first */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;
          vi = aj + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements: the stored diagonal is inverted here */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1./v[nz];
          AiUp[i] = AiUp[i+1] - (nz+1);

          ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = n;
        upTriFactor->csrMat->num_cols = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* newer API requires an explicit work buffer for the analysis/solve */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                                 );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
        /* keep the pinned value buffer for cheap refills on later numeric factorizations */
        upTriFactor->AA_h = AAUp;
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* structure exists: update values only */
        if (!upTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v = aa + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1./v[nz];
          ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
        ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Builds both ILU triangular factors on the GPU and uploads the row/column
   permutations (when not the identity) used by the triangular solves. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  if
(!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cusparseTriFactors->nnz=a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices: upload the row permutation only when non-identity */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
    ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* upper triangular indices: likewise for the column permutation */
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
    ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Builds the upper/lower triangular factors for ICC on the GPU.
   NOTE(review): A->data is cast both to Mat_SeqAIJ and Mat_SeqSBAIJ below; the
   ICC factor appears to be stored in SBAIJ (upper) layout -- confirm before
   changing either cast. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n)
/* MatSeqAIJCUSPARSEBuildICCTriMatrices (body): builds the upper-triangular Cholesky factor U in CSR form in pinned host buffers (AAUp) and simultaneously a scaled copy (AALo) used as the "lower" factor, which is solved with CUSPARSE_OPERATION_TRANSPOSE below so both factors can share the same U sparsity pattern (AiUp/AjUp). NOTE(review): A->data is read through a Mat_SeqSBAIJ cast above — presumably the Cholesky factor is stored in SBAIJ layout; confirm against the factorization routines. */
PetscFunctionReturn(0); 702 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 703 try { 704 cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 705 cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 706 if (!upTriFactor && !loTriFactor) { 707 /* Allocate Space for the upper triangular matrix */ 708 cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 709 cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 710 711 /* Fill the upper triangular matrix */ 712 AiUp[0]=(PetscInt) 0; 713 AiUp[n]=nzUpper; 714 offset = 0; 715 for (i=0; i<n; i++) { 716 /* set the pointers */ 717 v = aa + ai[i]; 718 vj = aj + ai[i]; 719 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 720 721 /* first, set the diagonal elements */ 722 AjUp[offset] = (PetscInt) i; 723 AAUp[offset] = (MatScalar)1.0/v[nz]; 724 AiUp[i] = offset; 725 AALo[offset] = (MatScalar)1.0/v[nz]; 726 727 offset+=1; 728 if (nz>0) { 729 ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr); 730 ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 731 for (j=offset; j<offset+nz; j++) { 732 AAUp[j] = -AAUp[j]; 733 AALo[j] = AAUp[j]/v[nz]; 734 } 735 offset+=nz; 736 } 737 } 738 739 /* allocate space for the triangular factor information */ 740 ierr = PetscNew(&upTriFactor);CHKERRQ(ierr); 741 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 742 743 /* Create the matrix description */ 744 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 745 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 746 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 747 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 748 #else 749 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 750 #endif 751 stat =
/* U factor: unit-diagonal upper-triangular descriptor; the CSR arrays are uploaded to the GPU via thrust assign() and the csrsv analysis is run once (timed under MAT_CUSPARSESolveAnalysis). */
cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 752 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 753 754 /* set the matrix */ 755 upTriFactor->csrMat = new CsrMatrix; 756 upTriFactor->csrMat->num_rows = A->rmap->n; 757 upTriFactor->csrMat->num_cols = A->cmap->n; 758 upTriFactor->csrMat->num_entries = a->nz; 759 760 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 761 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 762 763 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 764 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 765 766 upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 767 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 768 769 /* set the operation */ 770 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 771 772 /* Create the solve analysis information */ 773 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 774 stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 775 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 776 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 777 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 778 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 779 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 780 &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 781 cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 782 #endif 783 784 /* perform the solve analysis */ 785 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 786 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 787 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 788
/* "Lower" factor: reuses the U pattern (AiUp/AjUp) but carries the AALo values, declared non-unit-diagonal and solved transposed (CUSPARSE_OPERATION_TRANSPOSE), so L^T x = y is done with the upper-triangular storage. */
upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo 789 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 790 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 791 #endif 792 );CHKERRCUSPARSE(stat); 793 cerr = WaitForCUDA();CHKERRCUDA(cerr); 794 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 795 796 /* assign the pointer */ 797 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 798 799 /* allocate space for the triangular factor information */ 800 ierr = PetscNew(&loTriFactor);CHKERRQ(ierr); 801 loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 802 803 /* Create the matrix description */ 804 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 805 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 806 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 807 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 808 #else 809 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 810 #endif 811 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 812 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 813 814 /* set the operation */ 815 loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 816 817 /* set the matrix */ 818 loTriFactor->csrMat = new CsrMatrix; 819 loTriFactor->csrMat->num_rows = A->rmap->n; 820 loTriFactor->csrMat->num_cols = A->cmap->n; 821 loTriFactor->csrMat->num_entries = a->nz; 822 823 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 824 loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 825 826 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 827 loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 828 829 loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 830
loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 831 832 /* Create the solve analysis information */ 833 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 834 stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 835 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 836 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 837 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 838 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 839 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 840 &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 841 cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 842 #endif 843 844 /* perform the solve analysis */ 845 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 846 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 847 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 848 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo 849 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 850 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 851 #endif 852 );CHKERRCUSPARSE(stat); 853 cerr = WaitForCUDA();CHKERRCUDA(cerr); 854 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 855 856 /* assign the pointer */ 857 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 858 859 ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr); 860 cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 861 cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 862 } else { 863 /* Fill the upper triangular matrix */ 864 offset = 0; 865 for (i=0; i<n; i++) { 866 /* set the pointers */ 867 v = aa + ai[i]; 868 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 869 870 /* 
first, set the diagonal elements */ 871 AAUp[offset] = 1.0/v[nz]; 872 AALo[offset] = 1.0/v[nz]; 873 874 offset+=1; 875 if (nz>0) { 876 ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 877 for (j=offset; j<offset+nz; j++) { 878 AAUp[j] = -AAUp[j]; 879 AALo[j] = AAUp[j]/v[nz]; 880 } 881 offset+=nz; 882 } 883 } 884 if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 885 if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 886 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 887 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 888 ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 889 } 890 cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr); 891 cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr); 892 } catch(char *ex) { 893 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 894 } 895 } 896 PetscFunctionReturn(0); 897 } 898 899 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 900 { 901 PetscErrorCode ierr; 902 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 903 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 904 IS ip = a->row; 905 PetscBool perm_identity; 906 PetscInt n = A->rmap->n; 907 908 PetscFunctionBegin; 909 if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 910 ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr); 911 if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 912 cusparseTriFactors->nnz=(a->nz-n)*2 + n; 913 914 A->offloadmask = PETSC_OFFLOAD_BOTH; 915 916 /* lower triangular indices */ 917 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 918 if (!perm_identity) { 919 IS iip; 920 const PetscInt *irip,*rip; 921 922 ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr); 923 ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr); 924 ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); 925
/* ICC uses a single symmetric permutation ip: rows are permuted by ip (rpermIndices) and the solution is scattered back through its inverse (cpermIndices). The two numeric-factorization entry points below delegate to the CPU SeqAIJ kernels, pick MatSolve variants based on identity orderings, then push the factors to the GPU. */
cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 926 cusparseTriFactors->rpermIndices->assign(rip, rip+n); 927 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 928 cusparseTriFactors->cpermIndices->assign(irip, irip+n); 929 ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr); 930 ierr = ISDestroy(&iip);CHKERRQ(ierr); 931 ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 932 ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 933 } 934 PetscFunctionReturn(0); 935 } 936 937 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 938 { 939 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 940 IS isrow = b->row,iscol = b->col; 941 PetscBool row_identity,col_identity; 942 PetscErrorCode ierr; 943 944 PetscFunctionBegin; 945 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 946 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 947 B->offloadmask = PETSC_OFFLOAD_CPU; 948 /* determine which version of MatSolve needs to be used. */ 949 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 950 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 951 if (row_identity && col_identity) { 952 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 953 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 954 B->ops->matsolve = NULL; 955 B->ops->matsolvetranspose = NULL; 956 } else { 957 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 958 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 959 B->ops->matsolve = NULL; 960 B->ops->matsolvetranspose = NULL; 961 } 962 963 /* get the triangular factors */ 964 ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 965 PetscFunctionReturn(0); 966 } 967 968 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 969 { 970 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 971 IS ip = b->row; 972 PetscBool perm_identity; 973 PetscErrorCode ierr; 974 975 PetscFunctionBegin; 976 ierr =
/* (continuation of MatCholeskyFactorNumeric_SeqAIJCUSPARSE) CPU numeric Cholesky, then MatSolve selection and GPU upload of the factors — mirrors MatLUFactorNumeric_SeqAIJCUSPARSE above. */
MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 977 ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 978 B->offloadmask = PETSC_OFFLOAD_CPU; 979 /* determine which version of MatSolve needs to be used. */ 980 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 981 if (perm_identity) { 982 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 983 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 984 B->ops->matsolve = NULL; 985 B->ops->matsolvetranspose = NULL; 986 } else { 987 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 988 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 989 B->ops->matsolve = NULL; 990 B->ops->matsolvetranspose = NULL; 991 } 992 993 /* get the triangular factors */ 994 ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 995 PetscFunctionReturn(0); 996 } 997 998 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 999 { 1000 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1001 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1002 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1003 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; 1004 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; 1005 cusparseStatus_t stat; 1006 cusparseIndexBase_t indexBase; 1007 cusparseMatrixType_t matrixType; 1008 cusparseFillMode_t fillMode; 1009 cusparseDiagType_t diagType; 1010 cudaError_t cerr; 1011 PetscErrorCode ierr; 1012 1013 PetscFunctionBegin; 1014 /* allocate space for the transpose of the lower triangular factor */ 1015 ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr); 1016 loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1017 1018 /* set the matrix descriptors of the lower triangular factor */ 1019 matrixType = cusparseGetMatType(loTriFactor->descr); 1020 indexBase =
/* The transposed factor inherits type/base/diag from the original descriptor but flips the fill mode (UPPER <-> LOWER), since a CSR transpose swaps the triangles. */
cusparseGetMatIndexBase(loTriFactor->descr); 1021 fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1022 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1023 diagType = cusparseGetMatDiagType(loTriFactor->descr); 1024 1025 /* Create the matrix description */ 1026 stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat); 1027 stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 1028 stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 1029 stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 1030 stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1031 1032 /* set the operation */ 1033 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1034 1035 /* allocate GPU space for the CSC of the lower triangular factor*/ 1036 loTriFactorT->csrMat = new CsrMatrix; 1037 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 1038 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 1039 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 1040 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1); 1041 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 1042 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 1043 1044 /* compute the transpose of the lower triangular factor, i.e.
the CSC */ 1045 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1046 stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1047 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1048 loTriFactor->csrMat->values->data().get(), 1049 loTriFactor->csrMat->row_offsets->data().get(), 1050 loTriFactor->csrMat->column_indices->data().get(), 1051 loTriFactorT->csrMat->values->data().get(), 1052 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1053 CUSPARSE_ACTION_NUMERIC,indexBase, 1054 CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1055 cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1056 #endif 1057 1058 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1059 stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1060 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1061 loTriFactor->csrMat->values->data().get(), 1062 loTriFactor->csrMat->row_offsets->data().get(), 1063 loTriFactor->csrMat->column_indices->data().get(), 1064 loTriFactorT->csrMat->values->data().get(), 1065 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1066 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1067 CUSPARSE_ACTION_NUMERIC, indexBase, 1068 CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer 1069 #else 1070 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1071 CUSPARSE_ACTION_NUMERIC, indexBase 1072 #endif 1073 );CHKERRCUSPARSE(stat); 1074 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1075 ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1076 1077 /* Create the solve analysis information */ 1078 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1079 stat =
/* FIX: line 1075 previously called PetscLogEventBegin a second time for MAT_CUSPARSEGenerateTranspose (Begin at 1058 was never Ended), unbalancing the PETSc event-log stack; it must be PetscLogEventEnd, matching the Begin/End pairing used for MAT_CUSPARSESolveAnalysis in this file. The csr2csc above converts the lower factor's CSR into its transpose (CSC); on CUDA >= 11 an explicit workspace is sized and allocated first. */
/* csrsv analysis for the transposed lower factor (buffer sized/allocated on CUDA >= 9), then the same descriptor-flipping setup is repeated for the transpose of the upper factor. */
cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1080 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1081 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, 1082 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1083 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1084 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, 1085 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1086 cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1087 #endif 1088 1089 /* perform the solve analysis */ 1090 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 1091 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1092 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1093 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo 1094 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1095 ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1096 #endif 1097 );CHKERRCUSPARSE(stat); 1098 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1099 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1100 1101 /* assign the pointer */ 1102 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 1103 1104 /*********************************************/ 1105 /* Now the Transpose of the Upper Tri Factor */ 1106 /*********************************************/ 1107 1108 /* allocate space for the transpose of the upper triangular factor */ 1109 ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr); 1110 upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1111 1112 /* set the matrix descriptors of the upper triangular factor */ 1113 matrixType = cusparseGetMatType(upTriFactor->descr); 1114 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 1115
fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1116 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1117 diagType = cusparseGetMatDiagType(upTriFactor->descr); 1118 1119 /* Create the matrix description */ 1120 stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat); 1121 stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 1122 stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 1123 stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 1124 stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1125 1126 /* set the operation */ 1127 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1128 1129 /* allocate GPU space for the CSC of the upper triangular factor*/ 1130 upTriFactorT->csrMat = new CsrMatrix; 1131 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1132 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1133 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1134 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1); 1135 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1136 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1137 1138 /* compute the transpose of the upper triangular factor, i.e.
the CSC */ 1139 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1140 stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows, 1141 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1142 upTriFactor->csrMat->values->data().get(), 1143 upTriFactor->csrMat->row_offsets->data().get(), 1144 upTriFactor->csrMat->column_indices->data().get(), 1145 upTriFactorT->csrMat->values->data().get(), 1146 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1147 CUSPARSE_ACTION_NUMERIC,indexBase, 1148 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1149 cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1150 #endif 1151 1152 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1153 stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 1154 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1155 upTriFactor->csrMat->values->data().get(), 1156 upTriFactor->csrMat->row_offsets->data().get(), 1157 upTriFactor->csrMat->column_indices->data().get(), 1158 upTriFactorT->csrMat->values->data().get(), 1159 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1160 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1161 CUSPARSE_ACTION_NUMERIC, indexBase, 1162 CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer 1163 #else 1164 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1165 CUSPARSE_ACTION_NUMERIC, indexBase 1166 #endif 1167 );CHKERRCUSPARSE(stat); 1168 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1169 ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1170 1171 /* Create the solve analysis information */ 1172 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1173 stat =
/* FIX: line 1169 previously called PetscLogEventBegin a second time for MAT_CUSPARSEGenerateTranspose (Begin at 1152 was never Ended), unbalancing the PETSc event-log stack; it must be PetscLogEventEnd — same defect and fix as in the lower-factor transpose above. */
/* csrsv analysis for the transposed upper factor, then both transposed factors are cached on the tri-factor struct so MatSolveTranspose can reuse them. MatSeqAIJCUSPARSEGenerateTransposeForMult (starting at 1200) builds the explicit transpose used by MatMultTranspose. */
cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1174 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1175 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, 1176 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1177 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1178 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, 1179 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1180 cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1181 #endif 1182 1183 /* perform the solve analysis */ 1184 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 1185 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1186 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1187 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo 1188 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1189 ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1190 #endif 1191 );CHKERRCUSPARSE(stat); 1192 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1193 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1194 1195 /* assign the pointer */ 1196 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 1197 PetscFunctionReturn(0); 1198 } 1199 1200 static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A) 1201 { 1202 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1203 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1204 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1205 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1206 cusparseStatus_t stat; 1207 cusparseIndexBase_t indexBase; 1208 cudaError_t err; 1209 PetscErrorCode ierr; 1210
/* MatSeqAIJCUSPARSEGenerateTransposeForMult (body): no-op unless transpose generation is enabled, not yet cached, and the matrix is non-empty. Builds a new mult struct with device-resident alpha/beta scalars (pointer mode DEVICE), then materializes A^T: CSR path via csr2csc; ELL/HYB path (pre-CUDA-11 only) via hyb2csr -> csr2csc -> csr2hyb. */
1211 PetscFunctionBegin; 1212 if (!cusparsestruct->transgen || cusparsestruct->matTranspose || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0); 1213 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1214 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1215 /* create cusparse matrix */ 1216 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 1217 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat); 1218 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1219 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat); 1220 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1221 1222 /* set alpha and beta */ 1223 err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 1224 err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1225 err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1226 err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1227 err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1228 err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1229 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 1230 1231 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1232 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1233 CsrMatrix *matrixT= new CsrMatrix; 1234 matrixT->num_rows = A->cmap->n; 1235 matrixT->num_cols = A->rmap->n; 1236 matrixT->num_entries = a->nz; 1237 matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 1238 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1239 matrixT->values = new THRUSTARRAY(a->nz); 1240 1241 if (!cusparsestruct->rowoffsets_gpu) {
/* Full (uncompressed) row offsets a->i are uploaded since the mult struct may hold a compressed-row CSR; the csr2csc then produces A^T directly, using the user-selectable csr2cscAlg on CUDA >= 11. */
cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); } 1242 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 1243 1244 /* compute the transpose, i.e. the CSC */ 1245 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1246 stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, 1247 A->cmap->n, matrix->num_entries, 1248 matrix->values->data().get(), 1249 cusparsestruct->rowoffsets_gpu->data().get(), 1250 matrix->column_indices->data().get(), 1251 matrixT->values->data().get(), 1252 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1253 CUSPARSE_ACTION_NUMERIC,indexBase, 1254 cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1255 err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err); 1256 #endif 1257 1258 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1259 A->cmap->n, matrix->num_entries, 1260 matrix->values->data().get(), 1261 cusparsestruct->rowoffsets_gpu->data().get(), 1262 matrix->column_indices->data().get(), 1263 matrixT->values->data().get(), 1264 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1265 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1266 CUSPARSE_ACTION_NUMERIC,indexBase, 1267 cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer 1268 #else 1269 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1270 CUSPARSE_ACTION_NUMERIC, indexBase 1271 #endif 1272 );CHKERRCUSPARSE(stat); 1273 matstructT->mat = matrixT; 1274 1275 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1276 stat = cusparseCreateCsr(&matstructT->matDescr, 1277 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1278 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1279 matrixT->values->data().get(), 1280 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1281
indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 1282 #endif 1283 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1284 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1285 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1286 #else 1287 CsrMatrix *temp = new CsrMatrix; 1288 CsrMatrix *tempT = new CsrMatrix; 1289 /* First convert HYB to CSR */ 1290 temp->num_rows = A->rmap->n; 1291 temp->num_cols = A->cmap->n; 1292 temp->num_entries = a->nz; 1293 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1294 temp->column_indices = new THRUSTINTARRAY32(a->nz); 1295 temp->values = new THRUSTARRAY(a->nz); 1296 1297 stat = cusparse_hyb2csr(cusparsestruct->handle, 1298 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 1299 temp->values->data().get(), 1300 temp->row_offsets->data().get(), 1301 temp->column_indices->data().get());CHKERRCUSPARSE(stat); 1302 1303 /* Next, convert CSR to CSC (i.e. the matrix transpose) */ 1304 tempT->num_rows = A->rmap->n; 1305 tempT->num_cols = A->cmap->n; 1306 tempT->num_entries = a->nz; 1307 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1308 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1309 tempT->values = new THRUSTARRAY(a->nz); 1310 1311 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1312 temp->num_cols, temp->num_entries, 1313 temp->values->data().get(), 1314 temp->row_offsets->data().get(), 1315 temp->column_indices->data().get(), 1316 tempT->values->data().get(), 1317 tempT->column_indices->data().get(), 1318 tempT->row_offsets->data().get(), 1319 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1320 1321 /* Last, convert CSC to HYB */ 1322 cusparseHybMat_t hybMat; 1323 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1324 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1325 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1326 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1327 matstructT->descr, tempT->values->data().get(), 1328 tempT->row_offsets->data().get(), 1329 tempT->column_indices->data().get(), 1330 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1331 1332 /* assign the pointer */ 1333 matstructT->mat = hybMat; 1334 /* delete temporaries */ 1335 if (tempT) { 1336 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1337 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1338 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1339 delete (CsrMatrix*) tempT; 1340 } 1341 if (temp) { 1342 if (temp->values) delete (THRUSTARRAY*) temp->values; 1343 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1344 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1345 delete (CsrMatrix*) temp; 1346 } 1347 #endif 1348 } 1349 err = WaitForCUDA();CHKERRCUDA(err); 1350 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1351 ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1352 /* the compressed row indices is not used for matTranspose */ 1353 matstructT->cprowIndices = NULL; 1354 /* assign the pointer */ 1355 ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 1356 PetscFunctionReturn(0); 1357 } 1358 1359 /* Why do we need to analyze the tranposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE?
/*
   Solves A^T x = b on the GPU using the transposed triangular factors.

   The transposed factor structures (loTriFactorPtrTranspose/upTriFactorPtrTranspose)
   are built lazily on the first call via MatSeqAIJCUSPARSEAnalyzeTransposeForSolve().
   Solve order is the reverse of the forward solve: U^T first, then L^T.
   Row/column permutations from the factorization ordering are applied on the
   device with thrust permutation iterators.

   Input:  A  - the factored matrix (Mat_SeqAIJCUSPARSETriFactors in A->spptr)
           bb - right-hand side vector
   Output: xx - solution vector
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    /* reload: the analyze call populated the transpose factor pointers */
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: gather b through rpermIndices into x */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U (transposed factor): input xarray, output tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get()
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Then, solve L (transposed factor): input tempGPU, output xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* 2*nnz - n flops: one multiply-add per factor nonzero minus the diagonal divides counted once */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/*
   Solves A^T x = b on the GPU when the factorization used the natural ordering,
   so no row/column permutations are needed (compare MatSolveTranspose_SeqAIJCUSPARSE).
   Transposed factors are built lazily on first use; solve order is U^T then L^T.

   Input:  A  - the factored matrix
           bb - right-hand side vector
   Output: xx - solution vector
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    /* reload the transpose factor pointers populated by the analyze call */
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U: input barray, output tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get()
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Then, solve L: input tempGPU, output xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/*
   Solves A x = b on the GPU using the (non-transposed) triangular factors,
   applying the row permutation to b on entry and the column permutation to
   the result on exit. Solve order: L first, then U.

   Input:  A  - the factored matrix (Mat_SeqAIJCUSPARSETriFactors in A->spptr)
           bb - right-hand side vector
   Output: xx - solution vector
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: gather b through rpermIndices into tempGPU.
     NOTE: the range length is determined by the index iterators (begin/end of
     rpermIndices), so passing bGPU for both element iterators is correct. */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L: input tempGPU, output xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Then, solve U: input xarray, output tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get()
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Last, reorder with the column permutation: scatter tempGPU through cpermIndices into x */
  thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
(Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1597 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1598 PetscErrorCode ierr; 1599 cudaError_t cerr; 1600 1601 PetscFunctionBegin; 1602 /* Get the GPU pointers */ 1603 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1604 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1605 1606 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1607 /* First, solve L */ 1608 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1609 loTriFactor->csrMat->num_rows, 1610 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1611 loTriFactor->csrMat->num_entries, 1612 #endif 1613 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1614 loTriFactor->csrMat->values->data().get(), 1615 loTriFactor->csrMat->row_offsets->data().get(), 1616 loTriFactor->csrMat->column_indices->data().get(), 1617 loTriFactor->solveInfo, 1618 barray, tempGPU->data().get() 1619 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1620 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 1621 #endif 1622 );CHKERRCUSPARSE(stat); 1623 1624 /* Next, solve U */ 1625 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1626 upTriFactor->csrMat->num_rows, 1627 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1628 upTriFactor->csrMat->num_entries, 1629 #endif 1630 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1631 upTriFactor->csrMat->values->data().get(), 1632 upTriFactor->csrMat->row_offsets->data().get(), 1633 upTriFactor->csrMat->column_indices->data().get(), 1634 upTriFactor->solveInfo, 1635 tempGPU->data().get(), xarray 1636 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1637 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 1638 #endif 1639 );CHKERRCUSPARSE(stat); 1640 1641 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1642 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1643 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1644 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1645 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - 
A->cmap->n);CHKERRQ(ierr); 1646 PetscFunctionReturn(0); 1647 } 1648 1649 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) 1650 { 1651 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1652 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1653 cudaError_t cerr; 1654 PetscErrorCode ierr; 1655 1656 PetscFunctionBegin; 1657 if (A->offloadmask == PETSC_OFFLOAD_GPU) { 1658 CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat; 1659 1660 ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 1661 cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 1662 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1663 ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr); 1664 ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 1665 A->offloadmask = PETSC_OFFLOAD_BOTH; 1666 } 1667 PetscFunctionReturn(0); 1668 } 1669 1670 static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 1671 { 1672 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1673 PetscErrorCode ierr; 1674 1675 PetscFunctionBegin; 1676 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 1677 *array = a->a; 1678 A->offloadmask = PETSC_OFFLOAD_CPU; 1679 PetscFunctionReturn(0); 1680 } 1681 1682 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 1683 { 1684 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1685 Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 1686 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1687 PetscInt m = A->rmap->n,*ii,*ridx,tmp; 1688 PetscErrorCode ierr; 1689 cusparseStatus_t stat; 1690 PetscBool both = PETSC_TRUE; 1691 cudaError_t err; 1692 1693 PetscFunctionBegin; 1694 if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Cannot copy to GPU"); 1695 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 1696 if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { 
/*
   Copies/(re)builds the cusparse representation of A on the GPU.

   Two paths:
   - fast path: the nonzero pattern is unchanged (nonzerostate matches) and the
     format is CSR — only the values array is re-uploaded, and the cached
     transpose (if any) is refreshed with csr2csc;
   - full rebuild: the existing GPU structures are destroyed and recreated from
     the host CSR (or compressed-row CSR) data, honoring the requested storage
     format (CSR, or ELL/HYB on CUDA < 11).
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* Copy values only */
      CsrMatrix *matrix,*matrixT;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR values");
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      matrix->values->assign(a->a, a->a+a->nz);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);

      /* Update matT when it was built before */
      if (cusparsestruct->matTranspose) {
        cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
        matrixT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
        ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
        /* CUDA >= 11 uses the buffered csr2csc API with an explicit algorithm choice */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                                A->cmap->n, matrix->num_entries,
                                matrix->values->data().get(),
                                cusparsestruct->rowoffsets_gpu->data().get(),
                                matrix->column_indices->data().get(),
                                matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                CUSPARSE_ACTION_NUMERIC,indexBase,
                                cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
#else
                                matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                                CUSPARSE_ACTION_NUMERIC, indexBase
#endif
                                );CHKERRCUSPARSE(stat);
        err  = WaitForCUDA();CHKERRCUDA(err);
        ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
      }
    } else {
      /* Full rebuild of the device structures */
      PetscInt nnz;
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      try {
        /* compressed-row storage keeps only the nonempty rows */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR row data");
        if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR column data");
        /* without host values only the pattern is uploaded, so the host/device
           copies do not fully agree afterwards */
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
        else nnz = a->nz;

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident scalar constants for device pointer mode */
        err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                     mat->num_rows, mat->num_cols, mat->num_entries,
                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                     mat->values->data().get(),
                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* stage the data as CSR on the device, convert to HYB, then free the staging CSR */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
                                  matstruct->descr, mat->values->data().get(),
                                  mat->row_offsets->data().get(),
                                  mat->column_indices->data().get(),
                                  hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

/* Thrust functor: accumulate tuple element <0> into element <1> (y += x) */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/* Thrust functor: copy tuple element <0> into element <1> (y = x) */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
1879 }; 1880 1881 struct VecCUDAEqualsReverse 1882 { 1883 template <typename Tuple> 1884 __host__ __device__ 1885 void operator()(Tuple t) 1886 { 1887 thrust::get<0>(t) = thrust::get<1>(t); 1888 } 1889 }; 1890 1891 struct MatMatCusparse { 1892 PetscBool cisdense; 1893 PetscScalar *Bt; 1894 Mat X; 1895 PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */ 1896 PetscLogDouble flops; 1897 CsrMatrix *Bcsr; 1898 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1899 cusparseSpMatDescr_t matSpBDescr; 1900 PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 1901 cusparseDnMatDescr_t matBDescr; 1902 cusparseDnMatDescr_t matCDescr; 1903 PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/ 1904 size_t mmBufferSize; 1905 void *mmBuffer; 1906 void *mmBuffer2; /* SpGEMM WorkEstimation buffer */ 1907 cusparseSpGEMMDescr_t spgemmDesc; 1908 #endif 1909 }; 1910 1911 static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 1912 { 1913 PetscErrorCode ierr; 1914 MatMatCusparse *mmdata = (MatMatCusparse *)data; 1915 cudaError_t cerr; 1916 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1917 cusparseStatus_t stat; 1918 #endif 1919 1920 PetscFunctionBegin; 1921 cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr); 1922 delete mmdata->Bcsr; 1923 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1924 if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); } 1925 if (mmdata->mmBuffer) { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); } 1926 if (mmdata->mmBuffer2) { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); } 1927 if (mmdata->matBDescr) { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); } 1928 if (mmdata->matCDescr) { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); } 1929 if (mmdata->spgemmDesc) { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); } 1930 #endif 1931 ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr); 
PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);

/*
   Numeric phase of C = op(A) op(B) with A sparse (SEQAIJCUSPARSE) and B dense.

   Handles AB, AtB, ABt, PtAP and RARt. For PtAP/RARt the sparse-dense product
   is written into the intermediate mmdata->X and finished with a dense-dense
   product. On CUDA >= 11 the generic cusparseSpMM API is used (with cached
   descriptors and a reusable work buffer); earlier versions fall back to
   csrmm, transposing B explicitly through cublasXgeam when needed.

   Fix vs. previous revision: the pre-CUDA-11 branch declared an inner
   `cublasStatus_t cerr` that shadowed the outer `cudaError_t cerr` — same name,
   different status type, different CHKERR macro. Renamed to `cberr` to remove
   the shadowing hazard.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* pick the operand struct, operation and result dimensions per product type */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!cusp->transgen) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      /* an explicit transpose is preferred: op=NON_TRANSPOSE on A^T is faster */
      ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* PtAP/RARt write the sparse-dense product into the intermediate X */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
    /* grow-only buffer: reallocate only when the required size exceeds the cached one */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
      cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
#else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cberr; /* renamed: previously `cerr`, shadowing the outer cudaError_t cerr */

    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    /* out-of-place transpose of B into the preallocated mmdata->Bt */
    cberr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                        B->cmap->n,B->rmap->n,
                        &PETSC_CUSPARSE_ONE ,barray,blda,
                        &PETSC_CUSPARSE_ZERO,barray,blda,
                        mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cberr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
#endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  /* finish PtAP/RARt with a dense-dense product against X */
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* convert back to the CPU types the caller originally supplied */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
/*
   Symbolic phase of C = op(A) op(B) with A sparse (SEQAIJCUSPARSE) and B dense:
   sizes C, allocates the MatMatCusparse product data (Bt scratch for the
   pre-CUDA-11 csrmm path, the intermediate X for PtAP/RARt), and installs the
   numeric-phase callback.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  /* result dimensions per product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense;
#if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      /* X = A*R^T is rmap(A) x rmap(B) */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      /* X = A*P is rmap(A) x cmap(B) */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr); 2177 } 2178 } 2179 C->product->data = mmdata; 2180 C->product->destroy = MatDestroy_MatMatCusparse; 2181 2182 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 2183 PetscFunctionReturn(0); 2184 } 2185 2186 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2187 { 2188 Mat_Product *product = C->product; 2189 Mat A,B; 2190 Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2191 Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 2192 Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2193 CsrMatrix *Acsr,*Bcsr,*Ccsr; 2194 PetscBool flg; 2195 PetscErrorCode ierr; 2196 cusparseStatus_t stat; 2197 cudaError_t cerr; 2198 MatProductType ptype; 2199 MatMatCusparse *mmdata; 2200 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2201 cusparseSpMatDescr_t BmatSpDescr; 2202 #endif 2203 2204 PetscFunctionBegin; 2205 MatCheckProduct(C,1); 2206 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 2207 ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2208 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for C of type %s",((PetscObject)C)->type_name); 2209 mmdata = (MatMatCusparse*)C->product->data; 2210 A = product->A; 2211 B = product->B; 2212 if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */ 2213 mmdata->reusesym = PETSC_FALSE; 2214 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2215 if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2216 Cmat = Ccusp->mat; 2217 if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]); 2218 Ccsr = (CsrMatrix*)Cmat->mat; 2219 if (!Ccsr) 
SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize; /* empty result: nothing to compute, only flag assembly below */
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  /* values may have been modified on the CPU since the symbolic phase */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);

  /* for a symmetric operand the (non-conjugate) transpose equals the matrix itself,
     so AtB/ABt degrade to plain AB; must match the same mapping in the symbolic phase */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose; /* explicit transpose generated in the symbolic phase */
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
  if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  /* numeric SpGEMM: compute into the descriptor's workspace, then copy into C's CSR arrays */
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
  ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled =
PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}

/* Symbolic phase for sparse(A) * sparse(B) products (AB, AtB, ABt).
   Computes the nonzero pattern of C with cuSPARSE SpGEMM on the GPU, mirrors the
   pattern to the host SeqAIJ structure, and stores the descriptors/buffers in the
   product data so MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE can reuse them. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed,ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
  size_t                       bufSize2;
  cusparseSpMatDescr_t         BmatSpDescr;
#else
  int                          cnz;
#endif

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
  /* for a symmetric operand the (non-conjugate) transpose equals the matrix itself */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  switch (ptype) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    k = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr);
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  ierr  = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  ierr  = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  c     = (Mat_SeqAIJ*)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr  = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, then c will be in compressed row format */
    c->compressedrow.nrows = a->compressedrow.nrows;
    ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
    ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat      = Cmat;
  Ccusp->mat->mat = Ccsr;
  Ccsr->num_rows  = Ccusp->nrows;
  Ccsr->num_cols  = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
  stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
  /* device-resident scalar constants used as alpha/beta by the generic cuSPARSE API */
  cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
    c->nz = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix*)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
    /* build a shallow wrapper around B's CSR data with uncompressed row offsets;
       column indices and values are shared, not copied */
    Bcsr = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
      ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr = Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
                               Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                               Bcsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
  /* precompute flops count */
  if (ptype == MATPRODUCT_AB) {
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i+1];
      for (j=st; j<en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2.*(b->i[brow+1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt anzi = a->i[i+1] - a->i[i];
      const PetscInt bnzi = b->i[i+1] - b->i[i];
      flops += (2.*anzi)*bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
                           NULL, NULL, NULL,
                           CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                           CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
  c->nz = (PetscInt) C_nnz1;
  ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr);
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
  stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
  c->nz = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalizesym:
  /* mirror the GPU pattern into the host SeqAIJ structure */
  c->singlemalloc = PETSC_FALSE;
  c->free_a       = PETSC_TRUE;
  c->free_ij      = PETSC_TRUE;
  ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
    PetscInt *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii = *Ccsr->row_offsets;
    jj = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  if (ciscompressed) { /* need to expand host row offsets */
    PetscInt r = 0;
    c->i[0] = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old  = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r+1] = old;
    }
    for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
  c->maxnz         = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax          = 0;
  /* per-row lengths and row statistics, derived from the expanded row offsets */
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k+1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt)!!nn;
    c->rmax = PetscMax(c->rmax,nn);
  }
  ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated  = PETSC_TRUE;
  C->assembled     = PETSC_FALSE;
  C->was_assembled = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/* handles sparse or dense B */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscErrorCode ierr;
  PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr);
  /* a CPU-bound operand disables the CUSPARSE sparse-sparse path */
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
  }
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr);
    }
  }
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) {
        ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* yy = A xx: thin wrapper over the shared mult-add kernel (no add, no transpose) */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* zz = A xx + yy */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* yy = A^H xx (conjugate transpose) */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* zz = A^H xx + yy */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* yy = A^T xx */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* z = op(A) x + y.
   If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
/* Shared kernel behind MatMult/MatMultAdd/MatMultTranspose(Add)/MatMultHermitianTranspose(Add):
   computes zz = op(A)*xx (+ yy when yy is non-NULL), entirely on the GPU.

   Input:  A    - the SeqAIJCUSPARSE matrix (copied to the GPU on demand below)
           xx   - input vector
           yy   - vector to add, or NULL for a plain multiply (yy == zz is allowed)
           trans,herm - select op as documented in the comment above
   Output: zz   - result vector

   Compressed rows (zero rows dropped) are handled via cusparsestruct->workVector:
   for op=A the short product is scattered/added into zz afterwards; for op=A^T/A^H
   the relevant entries of xx are first gathered into the work vector. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny; /* lengths of x and y in y = op(A)x; only set (and needed) in the CSR path */
#endif

  PetscFunctionBegin;
  /* op = ^H without trans makes no sense with the flag encoding documented above */
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) {
    /* empty matrix: result is just 0 or a copy of yy; skip all cuSPARSE work */
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !cusparsestruct->transgen) {
      /* use the original matrix with a transpose/conjugate-transpose cuSPARSE op;
         presumably transgen means "an explicit transpose may be generated" - TODO confirm */
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      /* use an explicitly stored transpose with a non-transpose op, built lazily here */
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
         allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* cuSpMV[] is indexed by opA, so guard against the enum growing beyond the three known values */
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                       matstruct->matDescr,
                                       matstruct->cuSpMV[opA].vecXDescr, beta,
                                       matstruct->cuSpMV[opA].vecYDescr,
                                       cusparse_scalartype,
                                       cusparsestruct->spmvAlg,
                                       &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                          matstruct->alpha_one,
                          matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */
                          matstruct->cuSpMV[opA].vecXDescr,
                          beta,
                          matstruct->cuSpMV[opA].vecYDescr,
                          cusparse_scalartype,
                          cusparsestruct->spmvAlg,
                          matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
#else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
#endif
      }
    }
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      /* transpose path: the SpMV above already folded yy in via beta when yy == zz;
         otherwise add yy explicitly here */
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  /* 2*nz flops for a multiply-add; without yy the first write per nonzero row is not an add */
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* zz = A^T*xx + yy, delegated to the shared kernel above */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Finish assembly: defer to the CPU SeqAIJ implementation, then, if a split-CSR
   device matrix exists (values were set directly on the GPU), mark the GPU copy
   as the up-to-date one. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode             ierr;
  PetscSplitCSRDataStructure *d_mat = NULL;
  PetscFunctionBegin;
  /* NOTE(review): assumes A->spptr is non-NULL for unfactored matrices - confirm callers guarantee this */
  if (A->factortype == MAT_FACTOR_NONE) {
    d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
  }
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it?
  if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0);
  if (d_mat) {
    A->offloadmask = PETSC_OFFLOAD_GPU;
  }

  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Destructor: tear down the GPU-side structures (mult structs or triangular
   factors depending on factortype), free any split-CSR device matrix, then
   remove composed methods and fall through to the SeqAIJ destructor. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode             ierr;
  PetscSplitCSRDataStructure *d_mat = NULL;

  PetscFunctionBegin;
  /* NOTE(review): assumes A->spptr is non-NULL here - confirm this holds for never-assembled matrices */
  if (A->factortype == MAT_FACTOR_NONE) {
    d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
    ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL; /* stash d_mat first: MatSeqAIJCUSPARSE_Destroy frees the struct holding it */
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  if (d_mat) {
    Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
    cudaError_t                err;
    PetscSplitCSRDataStructure h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* copy the device-side descriptor back so we can free the device arrays it points to */
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    if (a->compressedrow.use) {
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);

/* Duplicate via the CPU SeqAIJ duplicate, then convert the copy in place to CUSPARSE */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Y += a*X on the GPU.
   - identical nonzero pattern          -> single cublasXaxpy over the value arrays
   - SUBSET_NONZERO_PATTERN             -> cuSPARSE csrgeam (Y = a*X + 1*Y), result written in place into Y
   - otherwise                          -> fall back to the CPU MatAXPY_SeqAIJ */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;
  cudaError_t        cerr;

  PetscFunctionBegin;
  if (X->ops->axpy != Y->ops->axpy) {
    /* mixed implementations (e.g. one matrix bound to CPU): do it on the CPU */
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
  cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
  if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }

  if (str == SUBSET_NONZERO_PATTERN) {
    cusparseStatus_t stat;
    PetscScalar      b = 1.0; /* coefficient of Y in a*X + b*Y */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t           bufferSize;
    void             *buffer;
#endif

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    /* csrgeam is called with host pointers for a and b; restore device pointer mode afterwards */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                          cy->mat->descr,    ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* output aliases Y's arrays: the result overwrites Y in place since Y's pattern contains X's */
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                               cy->mat->descr,    ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    cerr = cudaFree(buffer);CHKERRCUDA(cerr);
#else
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                               cy->mat->descr,    ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#endif
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else if (str == SAME_NONZERO_PATTERN) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t berr;
    PetscBLASInt   one = 1, bnz = 1;

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* same sparsity: the AXPY is a dense operation on the aligned value arrays */
    berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else {
    ierr = MatAXPY_SeqAIJ(Y,a,X,DIFFERENT_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Y *= a, as a dense cublasXscal over Y's nonzero value array on the GPU */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *y = (Mat_SeqAIJ*)Y->data;
  PetscScalar    *ay;
  cudaError_t    cerr;
  cublasHandle_t cublasv2handle;
  cublasStatus_t berr;
  PetscBLASInt   one = 1, bnz = 1;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(y->nz,&bnz);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  berr = cublasXscal(cublasv2handle,bnz,&a,ay,one);CHKERRCUBLAS(berr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuFlops(bnz);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Zero all stored values: fill the GPU CSR value arrays (and the cached transpose,
   if present) with zero, and zero the CPU value array as well. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  PetscBool      both = PETSC_FALSE; /* set when the GPU copy was zeroed too, so both copies stay valid */
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (spptr->mat) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE;
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
    if (spptr->matTranspose) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
      if (matrix->values) {
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
  }
  //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
  ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;

  PetscFunctionReturn(0);
}

/* Switch the matrix between CPU (flg=PETSC_TRUE) and GPU execution by swapping the
   ops table entries and the composed methods between the SeqAIJ and SeqAIJCUSPARSE
   implementations. Binding to the CPU first pulls any GPU data back to the host.
   No-op for factored matrices. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  if (flg) {
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  a->inode.use  = flg; /* inodes are a CPU-side optimization; disable them while on the GPU */
  PetscFunctionReturn(0);
}

/* Convert a SeqAIJ matrix to SeqAIJCUSPARSE: allocate the GPU-side spptr
   (Mat_SeqAIJCUSPARSE or the tri-factors variant for factored matrices),
   switch the default vector type to CUDA, and install the CUSPARSE ops. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of
CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat; /* MAT_INPLACE_MATRIX: B is A itself */

  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      spptr->format = MAT_CUSPARSE_CSR;
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
      spptr->deviceMat = NULL;
    } else {
      /* factored matrices carry triangular-factor data instead of a mult struct */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* install the remaining GPU ops and composed methods */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type constructor for MATSEQAIJCUSPARSE: build a SeqAIJ matrix, then convert it in place */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr);
  ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

   Level: beginner

.seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);

/* Register the CUSPARSE solver package for LU/Cholesky/ILU/ICC factorizations of SeqAIJCUSPARSE matrices */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Free the whole Mat_SeqAIJCUSPARSE container: both mult structs, helper vectors,
   COO permutation data, the cuSPARSE handle, and the struct itself. Safe on NULL. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* cudaFree(NULL) is a no-op, so no guard is needed here */
    cudaError_t cerr = cudaFree((*cusparsestruct)->csr2cscBuffer);CHKERRCUDA(cerr);
#endif
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Free a CsrMatrix (thrust vectors for values/indices/offsets) and NULL the pointer. Safe on NULL. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    delete (*mat)->values;
    delete (*mat)->column_indices;
    delete (*mat)->row_offsets;
    delete *mat;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}

/* Free a triangular-factor struct: matrix descriptor, analysis info, the CSR data,
   and the solve/csr2csc scratch buffers. Safe on NULL. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    if ((*trifactor)->solveBuffer) {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
    if ((*trifactor)->AA_h) {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
#endif
    ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Free a mult struct for the given storage format: the CSR or HYB matrix data,
   the descriptors, the device-resident alpha/beta constants, and (CUDA >= 11)
   the cached cusparseSpMV descriptors/buffers for each of the three ops. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
#endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        CsrMatrix_Destroy(&mat);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    for (int i=0; i<3; i++) { /* one cached SpMV setup per cusparseOperation_t value */
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}

/* Release all triangular-factor data and permutation indices, leaving the
   container (and its cuSPARSE handle) allocated for reuse. Safe on NULL. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector = NULL;
  }
PetscFunctionReturn(0); 3464 } 3465 3466 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 3467 { 3468 PetscErrorCode ierr; 3469 cusparseHandle_t handle; 3470 cusparseStatus_t stat; 3471 3472 PetscFunctionBegin; 3473 if (*trifactors) { 3474 ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr); 3475 if (handle = (*trifactors)->handle) { 3476 stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 3477 } 3478 ierr = PetscFree(*trifactors);CHKERRQ(ierr); 3479 } 3480 PetscFunctionReturn(0); 3481 } 3482 3483 struct IJCompare 3484 { 3485 __host__ __device__ 3486 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3487 { 3488 if (t1.get<0>() < t2.get<0>()) return true; 3489 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 3490 return false; 3491 } 3492 }; 3493 3494 struct IJEqual 3495 { 3496 __host__ __device__ 3497 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3498 { 3499 if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false; 3500 return true; 3501 } 3502 }; 3503 3504 struct IJDiff 3505 { 3506 __host__ __device__ 3507 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 3508 { 3509 return t1 == t2 ? 
0 : 1; 3510 } 3511 }; 3512 3513 struct IJSum 3514 { 3515 __host__ __device__ 3516 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 3517 { 3518 return t1||t2; 3519 } 3520 }; 3521 3522 #include <thrust/iterator/discard_iterator.h> 3523 PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 3524 { 3525 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3526 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3527 THRUSTARRAY *cooPerm_v = NULL; 3528 thrust::device_ptr<const PetscScalar> d_v; 3529 CsrMatrix *matrix; 3530 PetscErrorCode ierr; 3531 cudaError_t cerr; 3532 PetscInt n; 3533 3534 PetscFunctionBegin; 3535 if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct"); 3536 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix"); 3537 if (!cusp->cooPerm) { 3538 ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3539 ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3540 PetscFunctionReturn(0); 3541 } 3542 matrix = (CsrMatrix*)cusp->mat->mat; 3543 if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3544 if (!v) { 3545 if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3546 goto finalize; 3547 } 3548 n = cusp->cooPerm->size(); 3549 if (isCudaMem(v)) { 3550 d_v = thrust::device_pointer_cast(v); 3551 } else { 3552 cooPerm_v = new THRUSTARRAY(n); 3553 cooPerm_v->assign(v,v+n); 3554 d_v = cooPerm_v->data(); 3555 ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); 3556 } 3557 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3558 if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */ 3559 if (cusp->cooPerm_a) { 3560 THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size()); 3561 auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3562 
      /* sum duplicate COO entries into cooPerm_w (segments keyed by cooPerm_a),
         then add the reduced values into the existing CSR values */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* no duplicates: gather v through the permutation and += into the CSR values */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals());
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      /* no duplicates: gather v through the permutation and assign into the CSR values */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    }
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU; /* values now live on the device only */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
  ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr);
  a->reallocs = 0;
  A->info.mallocs += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}

#include <thrust/binary_search.h>
/* Preallocate A from a COO description (n entries with row indices coo_i and column
   indices coo_j, duplicates allowed). Builds on the device:
   - cusp->cooPerm:   permutation mapping the user's COO ordering to sorted CSR order
   - cusp->cooPerm_a: segment keys for reducing duplicate entries (NULL if all unique)
   and fills the host CSR pattern (a->i, a->j) of the Mat_SeqAIJ, with values zeroed.
   Values are set later via MatSetValuesCOO_SeqAIJCUSPARSE. */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  PetscInt cooPerm_n, nzr = 0; /* nzr counts nonzero rows */
  cudaError_t cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) { /* size changed: discard any previously built permutation data */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);          /* device copy of coo_i */
    THRUSTINTARRAY d_j(n);          /* device copy of coo_j */
    THRUSTINTARRAY ii(A->rmap->n);  /* per-row upper bounds, becomes a->i+1 */

    if (!cusp->cooPerm) { cusp->cooPerm = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* sort the (i,j) keys lexicographically, carrying along the identity permutation */
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());
    *cusp->cooPerm_a = d_i; /* keep sorted rows (and cols in w) before unique() compacts them */
    THRUSTINTARRAY w = d_j;

    auto nekey = thrust::unique(fkey, ekey, IJEqual());
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* I couldn't come up with a more elegant algorithm */
      /* build segment ids: mark positions where (i,j) differs from the previous entry,
         then inclusive-scan so equal consecutive keys share the same segment number */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());
      (*cusp->cooPerm_a)[0] = 0; /* adjacent_difference copies the first element; reset it */
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
    }
    /* count entries per row: upper_bound of each row index over the unique sorted rows */
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(),
                        search_begin, search_begin + A->rmap->n,
                        ii.begin());
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* rebuild the host CSR pattern of the Mat_SeqAIJ from the device results */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a = PETSC_TRUE;
    a->free_ij = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0;
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  } else {
    /* empty COO description: an empty preallocation */
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  /* the transpose structure (if any) no longer matches the new pattern */
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);

  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}

/* Return a read-only pointer to the device CSR values of A (copying host data to the
   GPU first if needed). Only the CSR storage format is supported. Pair with
   MatSeqAIJCUSPARSERestoreArrayRead(). */
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix *csr;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  PetscFunctionReturn(0);
}

/* Release a pointer obtained with MatSeqAIJCUSPARSEGetArrayRead(); read access does
   not change the object state. */
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  *a = NULL;
  PetscFunctionReturn(0);
}

/* Return a read/write pointer to the device CSR values of A (copying host data to the
   GPU first if needed) and mark the GPU copy as the valid one. Only the CSR storage
   format is supported. Pair with MatSeqAIJCUSPARSERestoreArray(). */
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix *csr;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}

/* Release a pointer obtained with MatSeqAIJCUSPARSEGetArray(); bumps the object state
   since the values may have been modified. */
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL;
  PetscFunctionReturn(0);
}

/* Return a write-only pointer to the device CSR values of A. Unlike GetArray(), no
   host-to-device copy is performed (existing values will be overwritten). Pair with
   MatSeqAIJCUSPARSERestoreArrayWrite(). */
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}

/* Release a pointer obtained with MatSeqAIJCUSPARSEGetArrayWrite(); bumps the object
   state since the values have been written. */
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL;
  PetscFunctionReturn(0);
}

/* Lexicographic ordering on the (row,col) part of a (row,col,value,flag) tuple;
   comparator for merging two COO streams */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    if (t1.get<0>() < t2.get<0>()) return true;
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
    return false;
  }
};

/* Unary functor adding a fixed shift to an index (used to offset B's column/row data) */
struct Shift
{
  int _shift;

  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &c)
  {
    return c + _shift;
  }
};

/* merges two SeqAIJCUSPARSE matrices, [A';B']' operation in matlab notation */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
  Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix *Acsr,*Bcsr,*Ccsr;
  PetscInt Annz,Bnnz;
  cusparseStatus_t stat;
  PetscInt i,m,n,zero = 0;
  cudaError_t cerr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n);
  if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
  if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* build C = [A B] from scratch: allocate the cusparse structures, merge the two
       COO streams on the device, and mirror the pattern back to the host Mat_SeqAIJ */
    m     = A->rmap->n;
    n     = A->cmap->n + B->cmap->n;
    ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
    ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
    ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    c     = (Mat_SeqAIJ*)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
    Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr  = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows    = m;
    Ccusp->mat      = Cmat;
    Ccusp->mat->mat = Ccsr;
    Ccsr->num_rows  = m;
    Ccsr->num_cols  = n;
    stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
    /* device-resident scalar constants used by SpMV */
    cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr);
    if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr = (CsrMatrix*)Acusp->mat->mat;
    Bcsr = (CsrMatrix*)Bcusp->mat->mat;
    Annz = (PetscInt)Acsr->column_indices->size();
    Bnnz = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets    = new THRUSTINTARRAY32(m+1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    Ccsr->num_entries    = c->nz;
    /* cooPerm records, for each C entry, where it came from; reused on MAT_REUSE_MATRIX */
    Ccusp->cooPerm       = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      auto Acoo = new THRUSTINTARRAY32(Annz);
      auto Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff,*Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
          ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
          ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      /* expand both CSR row offsets to explicit COO row indices */
      stat = cusparseXcsr2coo(Acusp->handle,
                              Aroff->data().get(),
                              Annz,
                              m,
                              Acoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle,
                              Broff->data().get(),
                              Bnnz,
                              m,
                              Bcoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
      /* shift B's columns by A->cmap->n on the fly while merging */
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
#endif
      /* merge the two (row,col,val,origin-flag) streams into C, keeping (row,col) order */
      auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
      auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
      auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
      auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
      auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
      auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
      auto p1 = Ccusp->cooPerm->begin();
      auto p2 = Ccusp->cooPerm->begin();
      thrust::advance(p2,Annz);
      PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
      /* undo the in-place column shift of B applied above */
      thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
#endif
      /* split the merged positions back into A-origin (flag 1) and B-origin (flag 0) */
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      auto pred = thrust::identity<int>();
      PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
      PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
#endif
      /* compress the merged COO row indices back to CSR row offsets */
      stat = cusparseXcoo2csr(Ccusp->handle,
                              Ccoo->data().get(),
                              c->nz,
                              m,
                              Ccsr->row_offsets->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
                               Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
      if (Acusp->transgen && Bcusp->transgen) { /* if A and B have the transpose, generate C transpose too */
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix *CcsrT = new CsrMatrix;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

        Ccusp->transgen = PETSC_TRUE;
        CmatT->cprowIndices = NULL;
        CmatT->mat = CcsrT;
        CcsrT->num_rows = n;
        CcsrT->num_cols = m;
        CcsrT->num_entries = c->nz;

        CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values = new THRUSTARRAY(c->nz);

        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        /* C' = [A' ; B'] stacked: concatenate row offsets (B's shifted by a->nz),
           column indices and values of the two transposes */
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
          thrust::advance(rT,-1); /* overlap the shared boundary offset */
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
          thrust::copy(titb,tite,rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

        stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
                                 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
                                 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror the device CSR pattern into the host Mat_SeqAIJ of C */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii = *Ccsr->row_offsets;
      jj = *Ccsr->column_indices;
      cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    } else {
      cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
    c->maxnz = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax = 0;
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i+1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax,nn);
    }
    ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
    (*C)->nonzerostate++;
    ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
    ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated = PETSC_TRUE;
  } else {
    /* MAT_REUSE_MATRIX: pattern unchanged; only scatter the new values of A and B
       into C (and C's transpose, if maintained) through the stored cooPerm */
    if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n);
    c = (Mat_SeqAIJ*)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
      if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
      if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
      if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
      if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix*)Acusp->mat->mat;
      Bcsr = (CsrMatrix*)Bcusp->mat->mat;
      Ccsr = (CsrMatrix*)Ccusp->mat->mat;
      if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
      if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
      if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
      if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
      if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid,Acsr->num_entries);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      /* A's values go through the first Annz cooPerm slots, B's through the rest */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      thrust::for_each(zibait,zieait,VecCUDAEquals());
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
      thrust::for_each(zibbit,ziebit,VecCUDAEquals());
      if (Acusp->transgen && Bcusp->transgen && Ccusp->transgen) {
        if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        /* transpose values are simply A's then B's, concatenated */
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
      }
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    }
  }
  ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
  (*C)->assembled = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}

/* Copy n selected entries of A's device CSR values into v.
   idx  - positions (into the values array) to gather, or NULL to copy the first n
          values contiguously
   v    - destination; may be host or device memory (detected via isCudaMem) */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode ierr;
  bool dmem; /* true when v is device memory */
  const PetscScalar *av;
  cudaError_t cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    THRUSTARRAY *w = NULL; /* device staging buffer when v is host memory */
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* gather av[idx[k]] -> dv[k] on the device */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) {
      cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    delete w;
  } else {
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  /* NOTE(review): the copies above move data device->host, yet this logs CpuToGpu;
     possibly should be PetscLogGpuToCpu — confirm against the logging convention */
  if (!dmem) { ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); }
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}