/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>

/* String table for MatCUSPARSEStorageFormat: the trailing entries (enum-type name,
   option prefix, NULL) follow the PetscOptionsEnum() convention used below. */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG = 1,
      CUSPARSE_CSRMV_ALG1 = 2,
      CUSPARSE_CSRMV_ALG2 = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1,
      CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2,
      CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3,
      CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1 = 1,
      CUSPARSE_SPMM_COO_ALG2 = 2,
      CUSPARSE_SPMM_COO_ALG3 = 3,
      CUSPARSE_SPMM_COO_ALG4 = 5,
      CUSPARSE_SPMM_CSR_ALG1 = 4,
      CUSPARSE_SPMM_CSR_ALG2 = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
  } cusparseCsr2CscAlg_t;
*/
/* NOTE: array positions must stay in sync with the cuSPARSE enum values above;
   MatSetFromOptions_SeqAIJCUSPARSE() verifies this at option-parsing time. */
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif

/* Forward declarations of the factorization, solve, and multiply implementations
   defined later in this file */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

/* Destructors for the GPU-side data structures (defined later in this file) */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);

PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);

/*
   MatCUSPARSESetStream - records the given CUDA stream in the matrix' GPU context
   and installs it on the matrix' cuSPARSE handle via cusparseSetStream(), so
   subsequent cuSPARSE calls on this matrix are issued on that stream.

   Errors with PETSC_ERR_COR if the matrix has no GPU context (spptr) yet.
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

/*
   MatCUSPARSESetHandle - replaces the matrix' cuSPARSE handle with a caller-provided
   one. If the matrix already owns a different handle it is destroyed first
   (cusparseDestroy); setting the same handle twice is a no-op for the swap.
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      /* release the previously owned handle before adopting the new one */
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
    }
    cusparsestruct->handle = handle;
  }
  /* pointer mode DEVICE: cuSPARSE will read scalar arguments (e.g. alpha/beta)
     from device memory rather than from the host */
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

/*
   MatCUSPARSEClearHandle - forgets the matrix' cuSPARSE handle WITHOUT calling
   cusparseDestroy() on it; the handle is presumably owned by whoever installed it
   via MatCUSPARSESetHandle() (NOTE(review): ownership convention inferred — confirm).
   Silently does nothing if A is not a MATSEQAIJCUSPARSE or has no GPU context.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          flg;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg || !cusparsestruct) PetscFunctionReturn(0);
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

/* Query callback composed on factor matrices: reports MATSOLVERCUSPARSE as the
   solver package. The Mat argument is unused but required by the callback signature. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/*
   MatGetFactor_seqaijcusparse_cusparse - creates the (empty) factor matrix B for A.

   Sets up B as a square n x n MATSEQAIJCUSPARSE with the symbolic-factorization
   entry points appropriate for ftype (LU/ILU/ILUDT or Cholesky/ICC); any other
   factor type is rejected with PETSC_ERR_SUP. Preallocation is skipped here
   (MAT_SKIP_ALLOCATION) — the symbolic factorization allocates later.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  (*B)->useordering = PETSC_TRUE;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type-specific implementation behind MatCUSPARSESetFormat(): records the requested
   storage format in the GPU context. For SEQAIJCUSPARSE both MAT_CUSPARSE_MULT and
   MAT_CUSPARSE_ALL set the same (single) format field. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.
   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Output Parameter:

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatches to MatCUSPARSESetFormat_SeqAIJCUSPARSE when composed; a no-op for other types */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
   MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose

   Collective on mat

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
-  transgen - the boolean flag

   Level: intermediate

.seealso: MATSEQAIJCUSPARSE, MatAIJCUSPARSESetGenerateTranspose()
@*/
PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
{
  PetscErrorCode ierr;
  PetscBool      flg;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* silently ignored for non-SEQAIJCUSPARSE matrices */
  ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (flg) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
    cusp->transgen = transgen;
    if (!transgen) { /* need to destroy the transpose matrix if present to prevent from logic errors if transgen is set to true later */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}

/*
   MatSetFromOptions_SeqAIJCUSPARSE - processes the -mat_cusparse_* command-line
   options: explicit-transpose generation, storage formats, and (for CUDA >= 11)
   the cuSPARSE SpMV/SpMM/csr2csc algorithm choices. Only unfactored matrices
   (MAT_FACTOR_NONE) take these options.
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscBool transgen = cusparsestruct->transgen;

    ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* The four symbolic-factorization wrappers below share the same pattern:
   reset any existing GPU triangular-factor data, run the host (SeqAIJ)
   symbolic factorization, then install the CUSPARSE numeric-factorization
   entry point so the numeric phase builds the GPU factors. */

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJCUSPARSEBuildILULowerTriMatrix - extracts the lower-triangular part of
   the (host) factored AIJ matrix A, adds the implicit unit diagonal, and uploads
   it as a CSR matrix plus cuSPARSE triangular-solve analysis data.

   Assumes A holds the combined L/U factor produced by the host ILU/LU numeric
   factorization (NOTE(review): provenance inferred from the callers — confirm).
   First call builds the full structure; later calls only refresh the values.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal.
      */
      /* row 0 of L is just the unit diagonal, rows 1..n-1 contribute their strictly-lower entries plus a 1 */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;

        /* pinned host buffers for the assembled CSR arrays */
        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix */
        AiLo[0]   = (PetscInt) 0;
        AiLo[n]   = nzLower;
        AjLo[0]   = (PetscInt) 0;
        AALo[0]   = (MatScalar) 1.0;
        v         = aa;
        vi        = aj;
        offset    = 1;
        rowOffset = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          /* copy the strictly-lower entries of row i, then append the unit diagonal */
          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        /* the diagonal is the implicit 1's added above */
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        /* assign() copies the host arrays to the device-side thrust vectors */
        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* query and allocate the scratch buffer required by the csrsv2-style solve */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                                 );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        /* keep AALo pinned for fast value-only updates later; free the index scratch */
        loTriFactor->AA_h = AALo;
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v      = aa;
        vi     = aj;
        offset = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJCUSPARSEBuildILUUpperTriMatrix - mirror of the routine above for the
   upper-triangular factor: extracts U (non-unit diagonal, stored inverted as
   1/d so the solve can multiply) from the host factor and uploads it with the
   cuSPARSE solve analysis. First call builds structure; later calls refresh values.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next,
         figure out the number of nonzeros in the upper triangular matrix. */
      /* adiag[] stores U's rows back-to-front, so adiag[0]-adiag[n] is the total count */
      nzUpper = adiag[0]-adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        /* pinned host buffer for the assembled values */
        cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = nzUpper;
        /* walk rows last-to-first, consistent with the back-to-front adiag layout */
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;
          vi = aj + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1./v[nz];  /* store the inverted diagonal 1/d */
          AiUp[i]      = AiUp[i+1] - (nz+1);

          ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        /* U carries an explicit (inverted) diagonal, unlike the unit-diagonal L */
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = n;
        upTriFactor->csrMat->num_cols    = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        /* assign() copies the host arrays to the device-side thrust vectors */
        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* query and allocate the scratch buffer required by the csrsv2-style solve */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                                 );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
        /* keep AAUp pinned for fast value-only updates later; free the index scratch */
        upTriFactor->AA_h = AAUp;
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      } else {
        /* structure already on the GPU: refresh values only */
        if (!upTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v = aa + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1./v[nz];
          ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
        ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - builds both ILU triangular factors
   on the GPU, allocates the solve work vector, and uploads the row/column
   permutation indices (only when the orderings are not the identity).
   On success the matrix is marked PETSC_OFFLOAD_BOTH.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* temporary vector shared by the two triangular solves */
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cusparseTriFactors->nnz=a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
    ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* upper triangular indices */
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
    ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJCUSPARSEBuildICCTriMatrices - builds the GPU triangular factors for
   ICC/Cholesky. NOTE(review): A->data is cast both to Mat_SeqAIJ and to
   Mat_SeqSBAIJ below — presumably the factored matrix is stored in SBAIJ layout
   while nz/rmap come from the AIJ header; confirm against the host ICC code.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n)
PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the values of both factors */
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) { /* first call: build pattern + values and run the solve analysis */
        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements (stored inverted); AALo gets the
             same inverted diagonal so the transposed solve sees a unit scaling */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            /* off-diagonals are negated for U; L additionally divides by the diagonal */
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                                 );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        /* the lower factor reuses the UPPER pattern and is solved with op=TRANSPOSE below */
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                                 );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else { /* pattern unchanged: only refresh the numerical values of both factors */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Push the ICC(0) factors to the GPU and record the row permutation and its
   inverse on the device for the permuted triangular solves. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  /* both triangles are solved, so count the strictly-triangular entries twice plus the diagonal */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip,*rip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    /* ICC is symmetric: the column permutation is the inverse of the row permutation */
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Numeric LU factorization: the factorization itself runs on the CPU
   (MatLUFactorNumeric_SeqAIJ); afterwards the triangular factors are copied to
   the GPU and the solve kernels are selected based on whether the orderings
   are identities (natural ordering avoids the permutation step in MatSolve). */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* make sure the CPU copy of A is current before factoring on the CPU */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Numeric Cholesky factorization: CPU factorization followed by GPU upload of
   the factors, mirroring MatLUFactorNumeric_SeqAIJCUSPARSE above. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr =
MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 976 ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 977 B->offloadmask = PETSC_OFFLOAD_CPU; 978 /* determine which version of MatSolve needs to be used. */ 979 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 980 if (perm_identity) { 981 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 982 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 983 B->ops->matsolve = NULL; 984 B->ops->matsolvetranspose = NULL; 985 } else { 986 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 987 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 988 B->ops->matsolve = NULL; 989 B->ops->matsolvetranspose = NULL; 990 } 991 992 /* get the triangular factors */ 993 ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 994 PetscFunctionReturn(0); 995 } 996 997 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 998 { 999 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1000 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1001 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1002 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; 1003 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; 1004 cusparseStatus_t stat; 1005 cusparseIndexBase_t indexBase; 1006 cusparseMatrixType_t matrixType; 1007 cusparseFillMode_t fillMode; 1008 cusparseDiagType_t diagType; 1009 cudaError_t cerr; 1010 PetscErrorCode ierr; 1011 1012 PetscFunctionBegin; 1013 /* allocate space for the transpose of the lower triangular factor */ 1014 ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr); 1015 loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1016 1017 /* set the matrix descriptors of the lower triangular factor */ 1018 matrixType = cusparseGetMatType(loTriFactor->descr); 1019 indexBase = 
cusparseGetMatIndexBase(loTriFactor->descr); 1020 fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1021 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1022 diagType = cusparseGetMatDiagType(loTriFactor->descr); 1023 1024 /* Create the matrix description */ 1025 stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat); 1026 stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 1027 stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 1028 stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 1029 stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1030 1031 /* set the operation */ 1032 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1033 1034 /* allocate GPU space for the CSC of the lower triangular factor*/ 1035 loTriFactorT->csrMat = new CsrMatrix; 1036 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 1037 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 1038 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 1039 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1); 1040 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 1041 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 1042 1043 /* compute the transpose of the lower triangular factor, i.e. 
the CSC */ 1044 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1045 stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1046 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1047 loTriFactor->csrMat->values->data().get(), 1048 loTriFactor->csrMat->row_offsets->data().get(), 1049 loTriFactor->csrMat->column_indices->data().get(), 1050 loTriFactorT->csrMat->values->data().get(), 1051 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1052 CUSPARSE_ACTION_NUMERIC,indexBase, 1053 CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1054 cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1055 #endif 1056 1057 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1058 stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1059 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1060 loTriFactor->csrMat->values->data().get(), 1061 loTriFactor->csrMat->row_offsets->data().get(), 1062 loTriFactor->csrMat->column_indices->data().get(), 1063 loTriFactorT->csrMat->values->data().get(), 1064 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1065 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1066 CUSPARSE_ACTION_NUMERIC, indexBase, 1067 CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer 1068 #else 1069 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1070 CUSPARSE_ACTION_NUMERIC, indexBase 1071 #endif 1072 );CHKERRCUSPARSE(stat); 1073 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1074 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1075 1076 /* Create the solve analysis information */ 1077 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1078 stat = 
cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1079 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1080 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, 1081 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1082 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1083 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, 1084 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1085 cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1086 #endif 1087 1088 /* perform the solve analysis */ 1089 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 1090 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1091 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1092 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo 1093 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1094 ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1095 #endif 1096 );CHKERRCUSPARSE(stat); 1097 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1098 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1099 1100 /* assign the pointer */ 1101 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 1102 1103 /*********************************************/ 1104 /* Now the Transpose of the Upper Tri Factor */ 1105 /*********************************************/ 1106 1107 /* allocate space for the transpose of the upper triangular factor */ 1108 ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr); 1109 upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1110 1111 /* set the matrix descriptors of the upper triangular factor */ 1112 matrixType = cusparseGetMatType(upTriFactor->descr); 1113 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 1114 
fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1115 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1116 diagType = cusparseGetMatDiagType(upTriFactor->descr); 1117 1118 /* Create the matrix description */ 1119 stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat); 1120 stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 1121 stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 1122 stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 1123 stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1124 1125 /* set the operation */ 1126 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1127 1128 /* allocate GPU space for the CSC of the upper triangular factor*/ 1129 upTriFactorT->csrMat = new CsrMatrix; 1130 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1131 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1132 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1133 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1); 1134 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1135 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1136 1137 /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 1138 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1139 stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows, 1140 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1141 upTriFactor->csrMat->values->data().get(), 1142 upTriFactor->csrMat->row_offsets->data().get(), 1143 upTriFactor->csrMat->column_indices->data().get(), 1144 upTriFactorT->csrMat->values->data().get(), 1145 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1146 CUSPARSE_ACTION_NUMERIC,indexBase, 1147 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1148 cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1149 #endif 1150 1151 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1152 stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 1153 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1154 upTriFactor->csrMat->values->data().get(), 1155 upTriFactor->csrMat->row_offsets->data().get(), 1156 upTriFactor->csrMat->column_indices->data().get(), 1157 upTriFactorT->csrMat->values->data().get(), 1158 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1159 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1160 CUSPARSE_ACTION_NUMERIC, indexBase, 1161 CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer 1162 #else 1163 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1164 CUSPARSE_ACTION_NUMERIC, indexBase 1165 #endif 1166 );CHKERRCUSPARSE(stat); 1167 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1168 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1169 1170 /* Create the solve analysis information */ 1171 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1172 stat = 
cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1173 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1174 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, 1175 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1176 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1177 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, 1178 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1179 cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1180 #endif 1181 1182 /* perform the solve analysis */ 1183 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 1184 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1185 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1186 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo 1187 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1188 ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1189 #endif 1190 );CHKERRCUSPARSE(stat); 1191 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1192 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1193 1194 /* assign the pointer */ 1195 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 1196 PetscFunctionReturn(0); 1197 } 1198 1199 static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A) 1200 { 1201 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1202 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1203 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1204 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1205 cusparseStatus_t stat; 1206 cusparseIndexBase_t indexBase; 1207 cudaError_t err; 1208 PetscErrorCode ierr; 1209 
  PetscFunctionBegin;
  /* nothing to do if transpose generation is disabled, already done, or the matrix is empty */
  if (!cusparsestruct->transgen || cusparsestruct->matTranspose || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0);
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* create cusparse matrix */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

  /* set alpha and beta: device-resident scalars used with CUSPARSE_POINTER_MODE_DEVICE */
  err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    /* the transpose has swapped dimensions but the same number of entries */
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* 32-bit copy of the host row offsets, refreshed on every call */
    if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
    cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

    /* compute the transpose, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                         A->cmap->n, matrix->num_entries,
                                         matrix->values->data().get(),
                                         cusparsestruct->rowoffsets_gpu->data().get(),
                                         matrix->column_indices->data().get(),
                                         matrixT->values->data().get(),
                                         matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                         CUSPARSE_ACTION_NUMERIC,indexBase,
                                         cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat);
    err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err);
#endif

    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            cusparsestruct->rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                            CUSPARSE_ACTION_NUMERIC,indexBase,
                            cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
#else
                            matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase
#endif
                            );CHKERRCUSPARSE(stat);
    matstructT->mat = matrixT;

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    stat = cusparseCreateCsr(&matstructT->matDescr,
                             matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                             matrixT->values->data().get(),
                             CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                             indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
    /* the HYB format has no direct transpose; round-trip HYB -> CSR -> CSC -> HYB */
    CsrMatrix *temp  = new CsrMatrix;
    CsrMatrix *tempT = new CsrMatrix;
    /* First convert HYB to CSR */
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSPARSE(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
                                       CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSPARSE(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
#endif
  }
  err  = WaitForCUDA();CHKERRCUDA(err);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices are not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}

/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE?
*/
/* Solve A^T x = b on the GPU using the cached transposed triangular factor
   structures (built lazily on first use).  Order of operations:
     1. permute b with the row permutation into x,
     2. triangular solve with the U-transpose factor (x -> work vector),
     3. triangular solve with the L-transpose factor (work vector -> x),
     4. permute with the column permutation (via the work vector, since the
        permutation cannot be applied in place).
   Note the factors are applied U before L, the reverse of the forward solve. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    /* re-read the pointers the analysis just populated */
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve U (the transposed upper factor comes first for A^T) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get()
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Same as MatSolveTranspose_SeqAIJCUSPARSE but for a factorization done in the
   natural ordering: no row/column permutations are applied, so b feeds the U
   solve directly and the L solve writes straight into x. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get()
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr =
VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Forward solve A x = b on the GPU with the cached L/U factors:
     1. permute b with the row permutation into the work vector,
     2. triangular solve with L (work vector -> x),
     3. triangular solve with U (x -> work vector),
     4. permute the work vector with the column permutation into x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation (range length is set by the index iterators) */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Then, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get()
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Forward solve in the natural ordering: no permutations, so b feeds the L
   solve directly and the U solve writes straight into x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get()
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Next, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz -
A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Download the CSR values array from the GPU into the host Mat_SeqAIJ storage
   when the up-to-date copy lives only on the GPU; afterwards both copies are
   valid (PETSC_OFFLOAD_BOTH).  Only values are copied — the nonzero pattern
   (i/j arrays) is assumed unchanged on the device. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cudaError_t        cerr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat;

    ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

/* MatSeqAIJGetArray implementation: make sure the host values are current,
   hand out the host array, and mark the CPU copy as the valid one (the caller
   may modify the returned array). */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  *array = a->a;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}

/* Upload the host CSR data to the GPU.  Two paths:
   - fast path: nonzero pattern unchanged and CSR storage -> copy values only,
     and refresh the cached transpose (csr2csc) if one was built before;
   - slow path: pattern changed -> destroy the old GPU structures and rebuild
     them from scratch (CSR, or HYB/ELL on pre-11.0 CUDA), including the
     compressed-row index array when the matrix uses compressed rows. */
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* Copy values only */
      CsrMatrix *matrix,*matrixT;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR values");
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      matrix->values->assign(a->a, a->a+a->nz);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);

      /* Update matT when it was built before */
      if (cusparsestruct->matTranspose) {
        cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
        matrixT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
        ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                                A->cmap->n, matrix->num_entries,
                                matrix->values->data().get(),
                                cusparsestruct->rowoffsets_gpu->data().get(),
                                matrix->column_indices->data().get(),
                                matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                CUSPARSE_ACTION_NUMERIC,indexBase,
                                cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
#else
                                matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                                CUSPARSE_ACTION_NUMERIC, indexBase
#endif
                                );CHKERRCUSPARSE(stat);
        err  = WaitForCUDA();CHKERRCUDA(err);
        ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
      }
    } else {
      PetscInt nnz;
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      try {
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR row data");
        if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR column data");
        /* no values yet: upload pattern only and keep the host copy authoritative */
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
        else nnz = a->nz;

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident alpha/beta scalars, matching the DEVICE pointer mode set below */
        err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                     mat->num_rows, mat->num_cols, mat->num_entries,
                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                     mat->values->data().get(),
                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* stage a temporary CSR matrix, convert it to HYB, then free the CSR staging data */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
                                  matstruct->descr, mat->values->data().get(),
                                  mat->row_offsets->data().get(),
                                  mat->column_indices->data().get(),
                                  hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

/* Functor for thrust::for_each over a zipped pair: accumulate element 0 into element 1 */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/* Functor for thrust::for_each over a zipped pair: copy element 0 into element 1 */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/* Functor for thrust::for_each over a zipped pair: copy element 1 into element 0
   (the reverse direction of VecCUDAEquals) */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};

/* Per-product scratch data attached to C->product->data for products
   involving a SeqAIJCUSPARSE matrix; freed by MatDestroy_MatMatCusparse */
struct MatMatCusparse {
  PetscBool      cisdense;  /* C was MATSEQDENSE on entry; convert back after computing on the GPU */
  PetscScalar    *Bt;       /* explicit B^T buffer for the pre-CUDA-11 csrmm path (no transpose-B support) */
  Mat            X;         /* intermediate dense result for RARt/PtAP */
  PetscBool      reusesym;  /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;
  CsrMatrix      *Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t matSpBDescr;
  PetscBool            initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;
  cusparseDnMatDescr_t matCDescr;
  PetscInt             Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
  size_t               mmBufferSize;
  void                 *mmBuffer;
  void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};

/* Destructor for MatMatCusparse product data: releases GPU buffers, cusparse
   descriptors, the intermediate matrix X, and the container itself */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  PetscErrorCode ierr;
  MatMatCusparse *mmdata = (MatMatCusparse *)data;
  cudaError_t    cerr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
#endif

  PetscFunctionBegin;
  cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
  delete mmdata->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->mmBuffer)    { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); }
  if (mmdata->mmBuffer2)   { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); }
  if (mmdata->matBDescr)   { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->matCDescr)   { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->spgemmDesc)  { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); }
#endif
  ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);

/* Numeric phase of sparse (SeqAIJCUSPARSE) x dense (SeqDENSECUDA) products.
   Computes op(A)*op(B) with cusparseSpMM (CUDA >= 11) or cusparseXcsrmm
   (older CUDA, transposing B explicitly via cublasXgeam when needed).  For
   RARt/PtAP the sparse product goes into the intermediate mmdata->X and is
   then combined with B by a dense-dense multiply. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* pick op(A) and the result dimensions m x n for each product type */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!cusp->transgen) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      /* use an explicitly generated transpose instead of the transpose operation */
      ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    /* sparse product result goes to the intermediate X, not to C directly */
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ?
    CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
    /* grow the workspace only when the required size exceeds what we already hold */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
      cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
#else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    /* form B^T explicitly in mmdata->Bt (allocated in the symbolic phase) */
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
#endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* undo the temporary GPU conversions done at entry */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Symbolic phase for sparse x dense products: sets the sizes/type of C,
   allocates the MatMatCusparse product data (plus the B^T buffer on pre-11.0
   CUDA and the intermediate X for RARt/PtAP), and installs the numeric hook. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  /* result dimensions for each supported product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense;
#if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr =
MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr); 2176 } 2177 } 2178 C->product->data = mmdata; 2179 C->product->destroy = MatDestroy_MatMatCusparse; 2180 2181 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 2182 PetscFunctionReturn(0); 2183 } 2184 2185 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2186 { 2187 Mat_Product *product = C->product; 2188 Mat A,B; 2189 Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2190 Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 2191 Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2192 CsrMatrix *Acsr,*Bcsr,*Ccsr; 2193 PetscBool flg; 2194 PetscErrorCode ierr; 2195 cusparseStatus_t stat; 2196 cudaError_t cerr; 2197 MatProductType ptype; 2198 MatMatCusparse *mmdata; 2199 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2200 cusparseSpMatDescr_t BmatSpDescr; 2201 #endif 2202 2203 PetscFunctionBegin; 2204 MatCheckProduct(C,1); 2205 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 2206 ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2207 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for C of type %s",((PetscObject)C)->type_name); 2208 mmdata = (MatMatCusparse*)C->product->data; 2209 A = product->A; 2210 B = product->B; 2211 if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */ 2212 mmdata->reusesym = PETSC_FALSE; 2213 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2214 if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2215 Cmat = Ccusp->mat; 2216 if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]); 2217 Ccsr = (CsrMatrix*)Cmat->mat; 2218 if (!Ccsr) 
SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize; /* empty result: only the bookkeeping below is needed */
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);

  /* exploit symmetry: op(A) on a symmetric matrix can use the non-transposed storage */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
  if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  /* rerun the SpGEMM compute/copy with the plan built in the symbolic phase */
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
  ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled =
PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}

/* Symbolic phase for sparse-sparse products C = op(A)*op(B), all SeqAIJCUSPARSE:
   builds the CSR structure of C on the GPU (via cusparseSpGEMM on CUDA >= 11, or
   csrgemm on older toolkits), mirrors the row/column structure on the host, and
   installs the numeric routine. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed,ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
  size_t                       bufSize2;
  cusparseSpMatDescr_t         BmatSpDescr;
#else
  int                          cnz;
#endif

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  C->product->data    = mmdata;
  C->product->destroy =
MatDestroy_MatMatCusparse;

  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
  /* exploit symmetry: op(A) on a symmetric matrix can use the non-transposed storage */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  switch (ptype) {
  case MATPRODUCT_AB:
    m    = A->rmap->n;
    n    = B->cmap->n;
    k    = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m    = A->cmap->n;
    n    = B->cmap->n;
    k    = A->rmap->n;
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m    = A->rmap->n;
    n    = B->rmap->n;
    k    = A->cmap->n;
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr);
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  ierr  = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  ierr  = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  c     = (Mat_SeqAIJ*)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr  = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, then c will be in compressed row format */
    c->compressedrow.nrows = a->compressedrow.nrows;
    ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
    ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat        = Cmat;
  Ccusp->mat->mat   = Ccsr;
  Ccsr->num_rows    = Ccusp->nrows;
  Ccsr->num_cols    = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
  stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
  /* device-resident scalar constants used by the SpGEMM calls (device pointer mode) */
  cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raises errors in different calls when matrices have zero rows/columns!
 */
    thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
    c->nz = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix*)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
    Bcsr = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
      ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr = Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
                               Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                               Bcsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
  /* precompute flops count */
  if (ptype == MATPRODUCT_AB) {
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i+1];
      for (j=st; j<en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2.*(b->i[brow+1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt anzi = a->i[i+1] - a->i[i];
      const PetscInt bnzi = b->i[i+1] - b->i[i];
      flops += (2.*anzi)*bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
                           NULL, NULL, NULL,
                           CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                           CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  stat  = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
  c->nz = (PetscInt) C_nnz1;
  ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr);
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
  stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
  c->nz = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported!
 */
  stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalizesym:
  /* mirror the GPU CSR structure on the host so the SeqAIJ part of C is consistent */
  c->singlemalloc = PETSC_FALSE;
  c->free_a       = PETSC_TRUE;
  c->free_ij      = PETSC_TRUE;
  ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
    PetscInt *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii   = *Ccsr->row_offsets;
    jj   = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  if (ciscompressed) { /* need to expand host row offsets */
    PetscInt r = 0;
    c->i[0] = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old  = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r+1] = old;
    }
    for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
  c->maxnz         = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax          = 0;
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k+1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt)!!nn;
    c->rmax = PetscMax(c->rmax,nn);
  }
  ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated  = PETSC_TRUE;
  C->assembled     = PETSC_FALSE;
  C->was_assembled = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/* handles sparse or dense B */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
PetscErrorCode ierr;
  PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr);
  /* a matrix bound to the CPU disqualifies the GPU kernels */
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
  }
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr);
    }
  }
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) {
        ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* y = A x */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* z = A x + y */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* y = A^H x */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* z = A^H x + y */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* y = A^T x */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* z = op(A) x + y.
If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
/* Shared kernel behind MatMult/MatMultAdd/MatMultTranspose(Add)/MatMultHermitianTranspose(Add):
   computes zz = op(A)*xx + yy on the GPU (yy may be NULL, meaning zz = op(A)*xx; yy may alias zz).
   op is selected by trans/herm as described in the comment above. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny; /* lengths of x and y for y = op(A)x; only set/used on the CSR path */
#endif

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) {
    /* A has no nonzero rows: the product is zero, so the result is just yy (or the zero vector) */
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !cusparsestruct->transgen) {
      /* Apply a (conjugate-)transpose operation flag to the non-transposed storage */
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      /* transgen: use an explicitly stored transpose so the SpMV itself is non-transposed */
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA is used to index the cuSpMV[] descriptor cache below, hence the range check */
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                       matstruct->matDescr,
                                       matstruct->cuSpMV[opA].vecXDescr, beta,
                                       matstruct->cuSpMV[opA].vecYDescr,
                                       cusparse_scalartype,
                                       cusparsestruct->spmvAlg,
                                       &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                          matstruct->alpha_one,
                          matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */
                          matstruct->cuSpMV[opA].vecXDescr,
                          beta,
                          matstruct->cuSpMV[opA].vecYDescr,
                          cusparse_scalartype,
                          cusparsestruct->spmvAlg,
                          matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
#else
      /* pre-CUDA-11 legacy cuSPARSE API */
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
#endif
      }
    }
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      /* transpose case: the SpMV already produced the full-length result with beta=1 when yy==zz;
         only a distinct yy still needs to be added */
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* zz = A^T xx + yy; thin wrapper over the shared add-kernel with trans=TRUE, herm=FALSE */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Finishes assembly; if a device (split-CSR) matrix exists and this is a final assembly,
   marks the data as resident on the GPU */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode             ierr;
  PetscSplitCSRDataStructure *d_mat = NULL;
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
  }
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it?
  if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0);
  if (d_mat) {
    A->offloadmask = PETSC_OFFLOAD_GPU;
  }

  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  /* sequential matrix: local sizes equal global sizes */
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Destroys the GPU-side storage (or triangular-factor storage for factored matrices),
   any device split-CSR matrix, then delegates to the host MatDestroy_SeqAIJ */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode             ierr;
  PetscSplitCSRDataStructure *d_mat = NULL;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    /* stash the device matrix pointer before the struct holding it is freed below */
    d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
    ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL;
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  if (d_mat) {
    Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
    cudaError_t                err;
    PetscSplitCSRDataStructure h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* copy the device struct back so device-allocated members can be freed */
    err  = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    if (a->compressedrow.use) {
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);

/* Duplicates by using the host SeqAIJ duplicate and converting the copy in place */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Y = Y + a*X, performed on the GPU when both matrices live there.
   Falls back to the host implementation when the operand types differ, uses cublas axpy
   for identical nonzero patterns, and cusparse geam for subset patterns. */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;
  cudaError_t        cerr;

  PetscFunctionBegin;
  if (X->ops->axpy != Y->ops->axpy) {
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
  cy   = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx   = (Mat_SeqAIJCUSPARSE*)X->spptr;
  if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
  if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }

  if (str == SUBSET_NONZERO_PATTERN) {
    /* Y = a*X + 1*Y via cusparse geam writing back into Y's storage */
    cusparseStatus_t stat;
    PetscScalar      b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t           bufferSize;
    void             *buffer;
#endif

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    /* alpha/beta (&a, &b) are host pointers here, so switch the pointer mode around the call */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                          cy->mat->descr,    ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                               cy->mat->descr,    ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    cerr = cudaFree(buffer);CHKERRCUDA(cerr);
#else
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                               cy->mat->descr,    ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#endif
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical structure: a flat axpy over the value arrays suffices */
    cublasHandle_t cublasv2handle;
    cublasStatus_t berr;
    PetscBLASInt   one = 1, bnz = 1;

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else {
    /* unrelated patterns: do it on the host */
    ierr = MatAXPY_SeqAIJ(Y,a,X,DIFFERENT_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Zeroes values on both GPU (mat and matTranspose, if present) and CPU copies,
   then sets the offload mask accordingly */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  PetscBool      both = PETSC_FALSE; /* becomes TRUE when GPU values were zeroed too */
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (spptr->mat) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE;
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
    if (spptr->matTranspose) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
      if (matrix->values) {
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
  }
  //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
  ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;

  PetscFunctionReturn(0);
}

/* Switches the matrix's operation table and composed functions between the host (flg=TRUE)
   and GPU (flg=FALSE) implementations; no-op for factored matrices */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  if (flg) {
    /* make sure the host copy is current before handing control to the CPU kernels */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  } else {
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  a->inode.use  = flg; /* inodes are a CPU-side optimization; disable them on the GPU */
  PetscFunctionReturn(0);
}

/* Converts a SeqAIJ matrix into a SeqAIJCUSPARSE one: allocates the spptr
   (CUSPARSE struct or tri-factor struct), installs the GPU op table, and renames the type */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat;

  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      spptr->format = MAT_CUSPARSE_CSR;
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
      spptr->deviceMat = NULL;
    } else {
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type constructor registered for MATSEQAIJCUSPARSE: create a SeqAIJ and convert in place */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr);
  ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

   Level: beginner

.seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);


/* Registers the cusparse solver type for LU/Cholesky/ILU/ICC factorizations of seqaijcusparse matrices */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Frees all GPU resources held by a Mat_SeqAIJCUSPARSE struct and the struct itself */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* cudaFree(NULL) is a no-op, so no guard is needed here */
    cudaError_t cerr = cudaFree((*cusparsestruct)->csr2cscBuffer);CHKERRCUDA(cerr);
#endif
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Frees a CsrMatrix's thrust containers and zeroes the caller's pointer */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    delete (*mat)->values;
    delete (*mat)->column_indices;
    delete (*mat)->row_offsets;
    delete *mat;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}

/* Overload for triangular-factor structs: releases descriptor, analysis info,
   CSR storage and scratch buffers, then frees the struct */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    if ((*trifactor)->solveBuffer) {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
    if ((*trifactor)->AA_h) {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
#endif
    ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Overload for SpMV structs: releases the CSR or HYB storage (HYB only pre-CUDA-11),
   descriptors, device-resident scalar constants, and cached SpMV descriptors/buffers */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
#endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        /* NOTE(review): return code of CsrMatrix_Destroy() is dropped here, unlike other call sites */
        CsrMatrix_Destroy(&mat);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* alpha_one/beta_zero/beta_one are device-resident scalars (handle uses device pointer mode) */
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    /* one cached descriptor set per cusparseOperation_t value (N, T, H) */
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}

/* Releases the four triangular factors and the permutation/work storage,
   but keeps the struct and its cusparse handle alive (for refactorization) */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector = NULL;
  }
  PetscFunctionReturn(0);
}

/* Full teardown: resets the factors, destroys the cusparse handle, and frees the struct */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    if (handle = (*trifactors)->handle) { /* intentional assignment-as-condition: skip when handle is NULL */
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Lexicographic less-than on (row,col) tuples; used to sort COO entries */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    if (t1.get<0>() < t2.get<0>()) return true;
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
    return false;
  }
};

/* Equality of (row,col) tuples; used to detect duplicate COO entries */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
    return true;
  }
};

/* Maps a pair of adjacent values to 0 when equal, 1 otherwise (difference indicator) */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return t1 == t2 ?
0 : 1; 3483 } 3484 }; 3485 3486 struct IJSum 3487 { 3488 __host__ __device__ 3489 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 3490 { 3491 return t1||t2; 3492 } 3493 }; 3494 3495 #include <thrust/iterator/discard_iterator.h> 3496 PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 3497 { 3498 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3499 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3500 THRUSTARRAY *cooPerm_v = NULL; 3501 thrust::device_ptr<const PetscScalar> d_v; 3502 CsrMatrix *matrix; 3503 PetscErrorCode ierr; 3504 cudaError_t cerr; 3505 PetscInt n; 3506 3507 PetscFunctionBegin; 3508 if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct"); 3509 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix"); 3510 if (!cusp->cooPerm) { 3511 ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3512 ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3513 PetscFunctionReturn(0); 3514 } 3515 matrix = (CsrMatrix*)cusp->mat->mat; 3516 if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3517 if (!v) { 3518 if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3519 goto finalize; 3520 } 3521 n = cusp->cooPerm->size(); 3522 if (isCudaMem(v)) { 3523 d_v = thrust::device_pointer_cast(v); 3524 } else { 3525 cooPerm_v = new THRUSTARRAY(n); 3526 cooPerm_v->assign(v,v+n); 3527 d_v = cooPerm_v->data(); 3528 ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); 3529 } 3530 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3531 if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */ 3532 if (cusp->cooPerm_a) { 3533 THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size()); 3534 auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3535 
thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3536 thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>()); 3537 delete cooPerm_w; 3538 } else { 3539 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 3540 matrix->values->begin())); 3541 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 3542 matrix->values->end())); 3543 thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); 3544 } 3545 } else { 3546 if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */ 3547 auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3548 thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3549 } else { 3550 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 3551 matrix->values->begin())); 3552 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 3553 matrix->values->end())); 3554 thrust::for_each(zibit,zieit,VecCUDAEquals()); 3555 } 3556 } 3557 cerr = WaitForCUDA();CHKERRCUDA(cerr); 3558 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3559 finalize: 3560 delete cooPerm_v; 3561 A->offloadmask = PETSC_OFFLOAD_GPU; 3562 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 3563 /* shorter version of MatAssemblyEnd_SeqAIJ */ 3564 ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr); 3565 ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 
0\n");CHKERRQ(ierr); 3566 ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr); 3567 a->reallocs = 0; 3568 A->info.mallocs += 0; 3569 A->info.nz_unneeded = 0; 3570 A->assembled = A->was_assembled = PETSC_TRUE; 3571 A->num_ass++; 3572 PetscFunctionReturn(0); 3573 } 3574 3575 #include <thrust/binary_search.h> 3576 PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[]) 3577 { 3578 PetscErrorCode ierr; 3579 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3580 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3581 PetscInt cooPerm_n, nzr = 0; 3582 cudaError_t cerr; 3583 3584 PetscFunctionBegin; 3585 ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr); 3586 ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr); 3587 cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0; 3588 if (n != cooPerm_n) { 3589 delete cusp->cooPerm; 3590 delete cusp->cooPerm_a; 3591 cusp->cooPerm = NULL; 3592 cusp->cooPerm_a = NULL; 3593 } 3594 if (n) { 3595 THRUSTINTARRAY d_i(n); 3596 THRUSTINTARRAY d_j(n); 3597 THRUSTINTARRAY ii(A->rmap->n); 3598 3599 if (!cusp->cooPerm) { cusp->cooPerm = new THRUSTINTARRAY(n); } 3600 if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); } 3601 3602 ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 3603 d_i.assign(coo_i,coo_i+n); 3604 d_j.assign(coo_j,coo_j+n); 3605 auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin())); 3606 auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end())); 3607 3608 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3609 thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0); 3610 thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); 3611 *cusp->cooPerm_a = d_i; 3612 THRUSTINTARRAY w = d_j; 3613 3614 auto nekey = thrust::unique(fkey, ekey, IJEqual()); 3615 if (nekey == ekey) { /* all entries are unique */ 3616 delete cusp->cooPerm_a; 3617 
cusp->cooPerm_a = NULL; 3618 } else { /* I couldn't come up with a more elegant algorithm */ 3619 adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); 3620 adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff()); 3621 (*cusp->cooPerm_a)[0] = 0; 3622 w[0] = 0; 3623 thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); 3624 thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); 3625 } 3626 thrust::counting_iterator<PetscInt> search_begin(0); 3627 thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), 3628 search_begin, search_begin + A->rmap->n, 3629 ii.begin()); 3630 cerr = WaitForCUDA();CHKERRCUDA(cerr); 3631 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3632 3633 ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr); 3634 a->singlemalloc = PETSC_FALSE; 3635 a->free_a = PETSC_TRUE; 3636 a->free_ij = PETSC_TRUE; 3637 ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr); 3638 a->i[0] = 0; 3639 cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 3640 a->nz = a->maxnz = a->i[A->rmap->n]; 3641 a->rmax = 0; 3642 ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr); 3643 ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr); 3644 cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 3645 if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); } 3646 if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); } 3647 for (PetscInt i = 0; i < A->rmap->n; i++) { 3648 const PetscInt nnzr = a->i[i+1] - a->i[i]; 3649 nzr += (PetscInt)!!(nnzr); 3650 a->ilen[i] = a->imax[i] = nnzr; 3651 a->rmax = PetscMax(a->rmax,nnzr); 3652 } 3653 a->nonzerorowcnt = nzr; 3654 A->preallocated = PETSC_TRUE; 3655 ierr = 
PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr); 3656 ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr); 3657 } else { 3658 ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr); 3659 } 3660 ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr); 3661 3662 /* We want to allocate the CUSPARSE struct for matvec now. 3663 The code is so convoluted now that I prefer to copy zeros */ 3664 ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr); 3665 ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr); 3666 A->offloadmask = PETSC_OFFLOAD_CPU; 3667 A->nonzerostate++; 3668 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 3669 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr); 3670 3671 A->assembled = PETSC_FALSE; 3672 A->was_assembled = PETSC_FALSE; 3673 PetscFunctionReturn(0); 3674 } 3675 3676 PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a) 3677 { 3678 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3679 CsrMatrix *csr; 3680 PetscErrorCode ierr; 3681 3682 PetscFunctionBegin; 3683 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3684 PetscValidPointer(a,2); 3685 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3686 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3687 if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3688 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 3689 csr = (CsrMatrix*)cusp->mat->mat; 3690 if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3691 *a = csr->values->data().get(); 3692 PetscFunctionReturn(0); 3693 } 3694 3695 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a) 3696 { 3697 PetscFunctionBegin; 3698 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3699 PetscValidPointer(a,2); 3700 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3701 
*a = NULL; 3702 PetscFunctionReturn(0); 3703 } 3704 3705 PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a) 3706 { 3707 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3708 CsrMatrix *csr; 3709 PetscErrorCode ierr; 3710 3711 PetscFunctionBegin; 3712 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3713 PetscValidPointer(a,2); 3714 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3715 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3716 if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3717 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 3718 csr = (CsrMatrix*)cusp->mat->mat; 3719 if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3720 *a = csr->values->data().get(); 3721 A->offloadmask = PETSC_OFFLOAD_GPU; 3722 PetscFunctionReturn(0); 3723 } 3724 3725 PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a) 3726 { 3727 PetscErrorCode ierr; 3728 3729 PetscFunctionBegin; 3730 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3731 PetscValidPointer(a,2); 3732 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3733 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 3734 *a = NULL; 3735 PetscFunctionReturn(0); 3736 } 3737 3738 PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a) 3739 { 3740 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3741 CsrMatrix *csr; 3742 3743 PetscFunctionBegin; 3744 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3745 PetscValidPointer(a,2); 3746 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3747 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3748 if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3749 csr = (CsrMatrix*)cusp->mat->mat; 3750 if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3751 
*a = csr->values->data().get(); 3752 A->offloadmask = PETSC_OFFLOAD_GPU; 3753 PetscFunctionReturn(0); 3754 } 3755 3756 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a) 3757 { 3758 PetscErrorCode ierr; 3759 3760 PetscFunctionBegin; 3761 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3762 PetscValidPointer(a,2); 3763 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3764 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 3765 *a = NULL; 3766 PetscFunctionReturn(0); 3767 } 3768 3769 struct IJCompare4 3770 { 3771 __host__ __device__ 3772 inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) 3773 { 3774 if (t1.get<0>() < t2.get<0>()) return true; 3775 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 3776 return false; 3777 } 3778 }; 3779 3780 struct Shift 3781 { 3782 int _shift; 3783 3784 Shift(int shift) : _shift(shift) {} 3785 __host__ __device__ 3786 inline int operator() (const int &c) 3787 { 3788 return c + _shift; 3789 } 3790 }; 3791 3792 /* merges to SeqAIJCUSPARSE matrices, [A';B']' operation in matlab notation */ 3793 PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 3794 { 3795 PetscErrorCode ierr; 3796 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c; 3797 Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp; 3798 Mat_SeqAIJCUSPARSEMultStruct *Cmat; 3799 CsrMatrix *Acsr,*Bcsr,*Ccsr; 3800 PetscInt Annz,Bnnz; 3801 cusparseStatus_t stat; 3802 PetscInt i,m,n,zero = 0; 3803 cudaError_t cerr; 3804 3805 PetscFunctionBegin; 3806 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3807 PetscValidHeaderSpecific(B,MAT_CLASSID,2); 3808 PetscValidPointer(C,4); 3809 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3810 PetscCheckTypeName(B,MATSEQAIJCUSPARSE); 3811 if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != 
%D",A->rmap->n,B->rmap->n); 3812 if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 3813 if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3814 if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3815 if (reuse == MAT_INITIAL_MATRIX) { 3816 m = A->rmap->n; 3817 n = A->cmap->n + B->cmap->n; 3818 ierr = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr); 3819 ierr = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr); 3820 ierr = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3821 c = (Mat_SeqAIJ*)(*C)->data; 3822 Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 3823 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 3824 Ccsr = new CsrMatrix; 3825 Cmat->cprowIndices = NULL; 3826 c->compressedrow.use = PETSC_FALSE; 3827 c->compressedrow.nrows = 0; 3828 c->compressedrow.i = NULL; 3829 c->compressedrow.rindex = NULL; 3830 Ccusp->workVector = NULL; 3831 Ccusp->nrows = m; 3832 Ccusp->mat = Cmat; 3833 Ccusp->mat->mat = Ccsr; 3834 Ccsr->num_rows = m; 3835 Ccsr->num_cols = n; 3836 stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat); 3837 stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 3838 stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 3839 cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 3840 cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 3841 cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 3842 cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 3843 cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 3844 cerr = cudaMemcpy(Cmat->beta_one, 
&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 3845 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 3846 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 3847 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 3848 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr); 3849 if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3850 if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3851 3852 Acsr = (CsrMatrix*)Acusp->mat->mat; 3853 Bcsr = (CsrMatrix*)Bcusp->mat->mat; 3854 Annz = (PetscInt)Acsr->column_indices->size(); 3855 Bnnz = (PetscInt)Bcsr->column_indices->size(); 3856 c->nz = Annz + Bnnz; 3857 Ccsr->row_offsets = new THRUSTINTARRAY32(m+1); 3858 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3859 Ccsr->values = new THRUSTARRAY(c->nz); 3860 Ccsr->num_entries = c->nz; 3861 Ccusp->cooPerm = new THRUSTINTARRAY(c->nz); 3862 if (c->nz) { 3863 auto Acoo = new THRUSTINTARRAY32(Annz); 3864 auto Bcoo = new THRUSTINTARRAY32(Bnnz); 3865 auto Ccoo = new THRUSTINTARRAY32(c->nz); 3866 THRUSTINTARRAY32 *Aroff,*Broff; 3867 3868 if (a->compressedrow.use) { /* need full row offset */ 3869 if (!Acusp->rowoffsets_gpu) { 3870 Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 3871 Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 3872 ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 3873 } 3874 Aroff = Acusp->rowoffsets_gpu; 3875 } else Aroff = Acsr->row_offsets; 3876 if (b->compressedrow.use) { /* need full row offset */ 3877 if (!Bcusp->rowoffsets_gpu) { 3878 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 3879 Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 3880 ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 3881 } 3882 Broff = Bcusp->rowoffsets_gpu; 3883 } else Broff = Bcsr->row_offsets; 3884 ierr = 
PetscLogGpuTimeBegin();CHKERRQ(ierr); 3885 stat = cusparseXcsr2coo(Acusp->handle, 3886 Aroff->data().get(), 3887 Annz, 3888 m, 3889 Acoo->data().get(), 3890 CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 3891 stat = cusparseXcsr2coo(Bcusp->handle, 3892 Broff->data().get(), 3893 Bnnz, 3894 m, 3895 Bcoo->data().get(), 3896 CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 3897 /* Issues when using bool with large matrices on SUMMIT 10.2.89 */ 3898 auto Aperm = thrust::make_constant_iterator(1); 3899 auto Bperm = thrust::make_constant_iterator(0); 3900 #if PETSC_PKG_CUDA_VERSION_GE(10,0,0) 3901 auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n)); 3902 auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n)); 3903 #else 3904 /* there are issues instantiating the merge operation using a transform iterator for the columns of B */ 3905 auto Bcib = Bcsr->column_indices->begin(); 3906 auto Bcie = Bcsr->column_indices->end(); 3907 thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n)); 3908 #endif 3909 auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz); 3910 auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm)); 3911 auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm)); 3912 auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm)); 3913 auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm)); 3914 auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin())); 3915 auto p1 = Ccusp->cooPerm->begin(); 3916 auto p2 = Ccusp->cooPerm->begin(); 3917 thrust::advance(p2,Annz); 3918 PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4())); 3919 #if PETSC_PKG_CUDA_VERSION_LT(10,0,0) 
3920 thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n)); 3921 #endif 3922 auto cci = thrust::make_counting_iterator(zero); 3923 auto cce = thrust::make_counting_iterator(c->nz); 3924 #if 0 //Errors on SUMMIT cuda 11.1.0 3925 PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>())); 3926 #else 3927 auto pred = thrust::identity<int>(); 3928 PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred)); 3929 PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred)); 3930 #endif 3931 stat = cusparseXcoo2csr(Ccusp->handle, 3932 Ccoo->data().get(), 3933 c->nz, 3934 m, 3935 Ccsr->row_offsets->data().get(), 3936 CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 3937 cerr = WaitForCUDA();CHKERRCUDA(cerr); 3938 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3939 delete wPerm; 3940 delete Acoo; 3941 delete Bcoo; 3942 delete Ccoo; 3943 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3944 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, 3945 Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), 3946 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 3947 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 3948 #endif 3949 if (Acusp->transgen && Bcusp->transgen) { /* if A and B have the transpose, generate C transpose too */ 3950 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 3951 Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct; 3952 CsrMatrix *CcsrT = new CsrMatrix; 3953 CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 3954 CsrMatrix *BcsrT = BT ? 
(CsrMatrix*)Bcusp->matTranspose->mat : NULL; 3955 3956 Ccusp->transgen = PETSC_TRUE; 3957 CmatT->cprowIndices = NULL; 3958 CmatT->mat = CcsrT; 3959 CcsrT->num_rows = n; 3960 CcsrT->num_cols = m; 3961 CcsrT->num_entries = c->nz; 3962 3963 CcsrT->row_offsets = new THRUSTINTARRAY32(n+1); 3964 CcsrT->column_indices = new THRUSTINTARRAY32(c->nz); 3965 CcsrT->values = new THRUSTARRAY(c->nz); 3966 3967 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3968 auto rT = CcsrT->row_offsets->begin(); 3969 if (AT) { 3970 rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT); 3971 thrust::advance(rT,-1); 3972 } 3973 if (BT) { 3974 auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz)); 3975 auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz)); 3976 thrust::copy(titb,tite,rT); 3977 } 3978 auto cT = CcsrT->column_indices->begin(); 3979 if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT); 3980 if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT); 3981 auto vT = CcsrT->values->begin(); 3982 if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 3983 if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 3984 cerr = WaitForCUDA();CHKERRCUDA(cerr); 3985 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3986 3987 stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat); 3988 stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 3989 stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 3990 cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 3991 cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 3992 cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 3993 cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, 
sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 3994 cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 3995 cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 3996 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3997 stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, 3998 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), 3999 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 4000 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 4001 #endif 4002 Ccusp->matTranspose = CmatT; 4003 } 4004 } 4005 4006 c->singlemalloc = PETSC_FALSE; 4007 c->free_a = PETSC_TRUE; 4008 c->free_ij = PETSC_TRUE; 4009 ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 4010 ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 4011 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 4012 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 4013 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 4014 ii = *Ccsr->row_offsets; 4015 jj = *Ccsr->column_indices; 4016 cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4017 cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4018 } else { 4019 cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4020 cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4021 } 4022 ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 4023 ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 4024 ierr = 
PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 4025 c->maxnz = c->nz; 4026 c->nonzerorowcnt = 0; 4027 c->rmax = 0; 4028 for (i = 0; i < m; i++) { 4029 const PetscInt nn = c->i[i+1] - c->i[i]; 4030 c->ilen[i] = c->imax[i] = nn; 4031 c->nonzerorowcnt += (PetscInt)!!nn; 4032 c->rmax = PetscMax(c->rmax,nn); 4033 } 4034 ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr); 4035 ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 4036 (*C)->nonzerostate++; 4037 ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr); 4038 ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr); 4039 Ccusp->nonzerostate = (*C)->nonzerostate; 4040 (*C)->preallocated = PETSC_TRUE; 4041 } else { 4042 if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n); 4043 c = (Mat_SeqAIJ*)(*C)->data; 4044 if (c->nz) { 4045 Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 4046 if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm"); 4047 if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4048 if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate"); 4049 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4050 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 4051 if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4052 if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4053 Acsr = (CsrMatrix*)Acusp->mat->mat; 4054 Bcsr = (CsrMatrix*)Bcusp->mat->mat; 4055 Ccsr = (CsrMatrix*)Ccusp->mat->mat; 4056 if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size()); 4057 if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != 
%D",Bcsr->num_entries,(PetscInt)Bcsr->values->size()); 4058 if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size()); 4059 if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries); 4060 if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size()); 4061 auto pmid = Ccusp->cooPerm->begin(); 4062 thrust::advance(pmid,Acsr->num_entries); 4063 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4064 auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), 4065 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin()))); 4066 auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), 4067 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4068 thrust::for_each(zibait,zieait,VecCUDAEquals()); 4069 auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), 4070 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4071 auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), 4072 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end()))); 4073 thrust::for_each(zibbit,ziebit,VecCUDAEquals()); 4074 if (Acusp->transgen && Bcusp->transgen && Ccusp->transgen) { 4075 if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct"); 4076 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4077 CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4078 CsrMatrix *BcsrT = BT ? 
(CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4079 CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat; 4080 auto vT = CcsrT->values->begin(); 4081 if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4082 if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 4083 } 4084 cerr = WaitForCUDA();CHKERRCUDA(cerr); 4085 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4086 } 4087 } 4088 ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr); 4089 (*C)->assembled = PETSC_TRUE; 4090 (*C)->was_assembled = PETSC_FALSE; 4091 (*C)->offloadmask = PETSC_OFFLOAD_GPU; 4092 PetscFunctionReturn(0); 4093 } 4094 4095 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[]) 4096 { 4097 PetscErrorCode ierr; 4098 bool dmem; 4099 const PetscScalar *av; 4100 cudaError_t cerr; 4101 4102 PetscFunctionBegin; 4103 dmem = isCudaMem(v); 4104 ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr); 4105 if (n && idx) { 4106 THRUSTINTARRAY widx(n); 4107 widx.assign(idx,idx+n); 4108 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 4109 4110 THRUSTARRAY *w = NULL; 4111 thrust::device_ptr<PetscScalar> dv; 4112 if (dmem) { 4113 dv = thrust::device_pointer_cast(v); 4114 } else { 4115 w = new THRUSTARRAY(n); 4116 dv = w->data(); 4117 } 4118 thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av); 4119 4120 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv)); 4121 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n)); 4122 thrust::for_each(zibit,zieit,VecCUDAEquals()); 4123 if (w) { 4124 cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4125 } 4126 delete w; 4127 } else { 4128 cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? 
cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4129 } 4130 if (!dmem) { ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); } 4131 ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr); 4132 PetscFunctionReturn(0); 4133 } 4134