/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>

/* String table for PetscOptionsEnum(); trailing entries follow the PetscOptionsEnum
   convention: enum type name, option prefix, terminating null */
const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
  } cusparseCsr2CscAlg_t;
*/
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif

/* Forward declarations of the factorization, solve, multiply and management
   routines implemented later in this file */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);

PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);

/* Attach the given CUDA stream to the cuSPARSE handle stored in A->spptr;
   subsequent cuSPARSE calls on this matrix are issued on that stream */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

/* Replace the cuSPARSE handle of A; any previously held (different) handle is
   destroyed first, and the pointer mode is set to device on the new handle */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

/* Forget (without destroying) the cuSPARSE handle of A; a no-op when A is not
   of type SEQAIJCUSPARSE or has no GPU data attached */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          flg;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg || !cusparsestruct) PetscFunctionReturn(0);
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

/* Query function composed on factor matrices: reports MATSOLVERCUSPARSE */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.
Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/* Create an (unfactored) SEQAIJCUSPARSE matrix B that will hold the requested
   factorization of A and install the appropriate symbolic-factorization ops */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  (*B)->useordering = PETSC_TRUE;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type-specific implementation behind MatCUSPARSESetFormat(); both supported
   operations currently map onto the single stored format field */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
  MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
  operation. Only the MatMult operation can use different GPU storage formats
  for MPIAIJCUSPARSE matrices.

  Not Collective

  Input Parameters:
+ A - Matrix of type SEQAIJCUSPARSE
. op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
- format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

  Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
  MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose

  Collective on mat

  Input Parameters:
+ A - Matrix of type SEQAIJCUSPARSE
- transgen - the boolean flag

  Level: intermediate

.seealso: MATSEQAIJCUSPARSE, MatAIJCUSPARSESetGenerateTranspose()
@*/
PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
{
  PetscErrorCode ierr;
  PetscBool      flg;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  ierr =
PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (flg) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
    cusp->transgen = transgen;
    if (!transgen) { /* need to destroy the transpose matrix if present to prevent from logic errors if transgen is set to true later */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}

/* Process runtime options for SEQAIJCUSPARSE matrices (storage format, explicit
   transpose generation and, for CUDA >= 11, cuSPARSE algorithm choices) */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscBool transgen = cusparsestruct->transgen;

    ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
PetscFunctionReturn(0);
}

/* Symbolic ILU: reset any stale GPU triangular factors, run the CPU symbolic
   phase, and install the CUSPARSE numeric-factorization routine */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic LU: same pattern as ILU above */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic ICC: reset GPU factors, run CPU symbolic phase, install numeric routine */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic Cholesky: same pattern as ICC above */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Build (or refresh the values of) the unit-diagonal lower triangular factor of
   an ILU factorization on the GPU, and run the cuSPARSE solve analysis on it */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal.
*/ 368 nzLower=n+ai[n]-ai[1]; 369 if (!loTriFactor) { 370 PetscScalar *AALo; 371 372 cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr); 373 374 /* Allocate Space for the lower triangular matrix */ 375 cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 376 cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr); 377 378 /* Fill the lower triangular matrix */ 379 AiLo[0] = (PetscInt) 0; 380 AiLo[n] = nzLower; 381 AjLo[0] = (PetscInt) 0; 382 AALo[0] = (MatScalar) 1.0; 383 v = aa; 384 vi = aj; 385 offset = 1; 386 rowOffset= 1; 387 for (i=1; i<n; i++) { 388 nz = ai[i+1] - ai[i]; 389 /* additional 1 for the term on the diagonal */ 390 AiLo[i] = rowOffset; 391 rowOffset += nz+1; 392 393 ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr); 394 ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr); 395 396 offset += nz; 397 AjLo[offset] = (PetscInt) i; 398 AALo[offset] = (MatScalar) 1.0; 399 offset += 1; 400 401 v += nz; 402 vi += nz; 403 } 404 405 /* allocate space for the triangular factor information */ 406 ierr = PetscNew(&loTriFactor);CHKERRQ(ierr); 407 loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 408 /* Create the matrix description */ 409 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 410 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 411 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 412 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 413 #else 414 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 415 #endif 416 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat); 417 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 418 419 /* set the operation */ 420 loTriFactor->solveOp = 
CUSPARSE_OPERATION_NON_TRANSPOSE; 421 422 /* set the matrix */ 423 loTriFactor->csrMat = new CsrMatrix; 424 loTriFactor->csrMat->num_rows = n; 425 loTriFactor->csrMat->num_cols = n; 426 loTriFactor->csrMat->num_entries = nzLower; 427 428 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 429 loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1); 430 431 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 432 loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower); 433 434 loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 435 loTriFactor->csrMat->values->assign(AALo, AALo+nzLower); 436 437 /* Create the solve analysis information */ 438 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 439 stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 440 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 441 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 442 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 443 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 444 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 445 &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 446 cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 447 #endif 448 449 /* perform the solve analysis */ 450 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 451 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 452 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 453 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo 454 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 455 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 456 #endif 457 );CHKERRCUSPARSE(stat); 458 cerr = WaitForCUDA();CHKERRCUDA(cerr); 459 ierr = 
PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 460 461 /* assign the pointer */ 462 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 463 loTriFactor->AA_h = AALo; 464 cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr); 465 cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr); 466 ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 467 } else { /* update values only */ 468 if (!loTriFactor->AA_h) { 469 cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr); 470 } 471 /* Fill the lower triangular matrix */ 472 loTriFactor->AA_h[0] = 1.0; 473 v = aa; 474 vi = aj; 475 offset = 1; 476 for (i=1; i<n; i++) { 477 nz = ai[i+1] - ai[i]; 478 ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr); 479 offset += nz; 480 loTriFactor->AA_h[offset] = 1.0; 481 offset += 1; 482 v += nz; 483 } 484 loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower); 485 ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 486 } 487 } catch(char *ex) { 488 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 489 } 490 } 491 PetscFunctionReturn(0); 492 } 493 494 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 495 { 496 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 497 PetscInt n = A->rmap->n; 498 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 499 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 500 cusparseStatus_t stat; 501 const PetscInt *aj = a->j,*adiag = a->diag,*vi; 502 const MatScalar *aa = a->a,*v; 503 PetscInt *AiUp, *AjUp; 504 PetscInt i,nz, nzUpper, offset; 505 PetscErrorCode ierr; 506 cudaError_t cerr; 507 508 PetscFunctionBegin; 509 if (!n) PetscFunctionReturn(0); 510 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 511 try { 512 /* next, 
figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix; rows are walked backwards because
           a->diag stores the U part in reverse order */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;
          vi = aj + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1./v[nz];
          AiUp[i]      = AiUp[i+1] - (nz+1);

          ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = n;
        upTriFactor->csrMat->num_cols    = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                                 );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
        upTriFactor->AA_h = AAUp;   /* keep the pinned value buffer for later numeric updates */
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      } else {
        if (!upTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v = aa + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1./v[nz];
          ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
        ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Build both ILU triangular factors on the GPU and upload the row/column
   permutations (if any) used by the triangular solves */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  if
(!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cusparseTriFactors->nnz=a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
    ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* upper triangular indices */
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
    ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Build both ICC triangular factors on the GPU from the SBAIJ-stored upper
   factor (the lower factor is derived by scaling rows of the upper one) */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n)
PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the two factors' values */
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt) 0;
        AiUp[n] = nzUpper;
        offset  = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements (inverted; shared structure, so
             AALo gets the same CSR pattern with rescaled values) */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset += 1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset += nz;
          }
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = A->rmap->n;
        upTriFactor->csrMat->num_cols    = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                                );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        /* the "lower" factor reuses the UPPER-stored structure and is solved
           with op = TRANSPOSE below */
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = A->rmap->n;
        loTriFactor->csrMat->num_cols    = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                                );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else {
        /* structure already on the GPU: recompute only the values */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset += 1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset += nz;
          }
        }
        if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Build the ICC factors on the GPU, allocate the solve work vector, and cache
   the permutation and its inverse for permuted triangular solves. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  /* off-diagonal entries count twice (upper stored, used for both factors) */
  cusparseTriFactors->nnz = (a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip,*rip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Numeric LU factorization: factor on the CPU with MatLUFactorNumeric_SeqAIJ,
   then copy the triangular factors to the GPU and choose the MatSolve
   implementation based on whether the orderings are identities. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Numeric Cholesky factorization: factor on the CPU, then copy the factors to
   the GPU and choose the MatSolve implementation based on the ordering. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Compute the transposes (via csr2csc) of both triangular factors and run the
   cuSPARSE triangular-solve analysis on them; used lazily by
   MatSolveTranspose_SeqAIJCUSPARSE. */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  =
cusparseGetMatIndexBase(loTriFactor->descr);
  /* transposition flips the fill mode */
  fillMode   = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
               CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor */
  loTriFactorT->csrMat                 = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                       loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                       loTriFactor->csrMat->values->data().get(),
                                       loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(),
                                       loTriFactorT->csrMat->values->data().get(),
                                       loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
#else
                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
#endif
                         );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* bug fix: this previously called PetscLogEventBegin() a second time, leaving
     the MAT_CUSPARSEGenerateTranspose event unbalanced */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                           loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
#endif
                          );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
               CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor */
  upTriFactorT->csrMat                 = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                       upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                       upTriFactor->csrMat->values->data().get(),
                                       upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(),
                                       upTriFactorT->csrMat->values->data().get(),
                                       upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
#else
                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
#endif
                         );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* bug fix: this previously called PetscLogEventBegin() a second time, leaving
     the MAT_CUSPARSEGenerateTranspose event unbalanced */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                           upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
#endif
                          );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}

/* Build (lazily) the transpose of A on the GPU for MatMultTranspose et al. */
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT     = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;
PetscFunctionBegin;
  /* nothing to do if transposes are disabled, already built, or A is empty */
  if (!cusparsestruct->transgen || cusparsestruct->matTranspose || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0);
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* create cusparse matrix */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

  /* set alpha and beta (device-resident scalars; pointer mode is DEVICE) */
  err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT = new CsrMatrix;
    matrixT->num_rows       = A->cmap->n;
    matrixT->num_cols       = A->rmap->n;
    matrixT->num_entries    = a->nz;
    matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values         = new THRUSTARRAY(a->nz);

    /* uncompressed row offsets of A, uploaded lazily and refreshed here */
    if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
    cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

    /* compute the transpose, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                         A->cmap->n, matrix->num_entries,
                                         matrix->values->data().get(),
                                         cusparsestruct->rowoffsets_gpu->data().get(),
                                         matrix->column_indices->data().get(),
                                         matrixT->values->data().get(),
                                         matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                         CUSPARSE_ACTION_NUMERIC,indexBase,
                                         cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat);
    err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err);
#endif

    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            cusparsestruct->rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                            CUSPARSE_ACTION_NUMERIC,indexBase,
                            cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
#else
                            matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase
#endif
                           );CHKERRCUSPARSE(stat);
    matstructT->mat = matrixT;

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    stat = cusparseCreateCsr(&matstructT->matDescr,
                             matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                             matrixT->values->data().get(),
                             CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                             indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
    CsrMatrix *temp  = new CsrMatrix;
    CsrMatrix *tempT = new CsrMatrix;
    /* First convert HYB to CSR */
    temp->num_rows       = A->rmap->n;
    temp->num_cols       = A->cmap->n;
    temp->num_entries    = a->nz;
    temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values         = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSPARSE(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    tempT->num_rows       = A->rmap->n;
    tempT->num_cols       = A->cmap->n;
    tempT->num_entries    = a->nz;
    tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values         = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
                                       CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSPARSE(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
#endif
  }
  err  = WaitForCUDA();CHKERRCUDA(err);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}

/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE?
*/
/*
  MatSolveTranspose_SeqAIJCUSPARSE - triangular solve with the TRANSPOSE of the factored matrix.
  The transposed factors are built lazily on first use (MatSeqAIJCUSPARSEAnalyzeTransposeForSolve).
  Sequence: permute b by the row permutation into x, solve the transposed upper factor,
  solve the transposed lower factor, then apply the column permutation (via a temporary,
  since the permuted copy cannot be done in place).
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U (for the transposed system the factor order is reversed: U^T before L^T) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get()
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
  MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - as MatSolveTranspose_SeqAIJCUSPARSE,
  but for a factorization done in the natural ordering, so no row/column permutations
  are applied: b feeds the transposed upper solve directly and the transposed lower
  solve writes straight into x.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get()
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
  MatSolve_SeqAIJCUSPARSE - forward/backward triangular solve x = U^{-1} L^{-1} P_r b
  followed by the column permutation, using the cached lower/upper factors on the GPU.
  b is gathered through the row permutation into the work vector, L then U are solved,
  and the result is scattered through the column permutation into x.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation
     (NOTE(review): the end iterator reuses the bGPU base; iteration appears to be
      driven by the rpermIndices range — confirm against thrust::permutation_iterator docs) */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Then, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get()
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
  MatSolve_SeqAIJCUSPARSE_NaturalOrdering - as MatSolve_SeqAIJCUSPARSE but with
  no permutations: solve L then U directly from b into x.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor =
                                      (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get()
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Next, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
  MatSeqAIJCUSPARSECopyFromGPU - when the GPU holds the only up-to-date copy
  (PETSC_OFFLOAD_GPU), copy the CSR values array device->host and mark the
  matrix as valid on both sides. Only the values are copied; the nonzero
  pattern is assumed unchanged.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cudaError_t        cerr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat;

    ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

/*
  MatSeqAIJGetArray_SeqAIJCUSPARSE - return the host values array after syncing
  it from the GPU; the offload mask is set to CPU since the caller may modify
  the values through the returned pointer.
*/
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  *array = a->a;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}

/*
  MatSeqAIJCUSPARSECopyToGPU - push the host matrix to the GPU. If the nonzero
  pattern is unchanged (same nonzerostate) and the storage format is CSR, only
  the values are copied (and the cached transpose, if present, is refreshed);
  otherwise the GPU structures are destroyed and rebuilt from scratch.
  Errors if the matrix is bound to the CPU.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* Copy values only */
      CsrMatrix *matrix,*matrixT;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR values");
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      matrix->values->assign(a->a, a->a+a->nz);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);

      /* Update matT when it was built before */
      if (cusparsestruct->matTranspose) {
        cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
        matrixT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
        ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
        /* Recompute the transpose numerically via csr2csc (argument order differs across CUDA versions) */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                                A->cmap->n, matrix->num_entries,
                                matrix->values->data().get(),
                                cusparsestruct->rowoffsets_gpu->data().get(),
                                matrix->column_indices->data().get(),
                                matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                CUSPARSE_ACTION_NUMERIC,indexBase,
                                cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
#else
                                matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                                CUSPARSE_ACTION_NUMERIC, indexBase
#endif
                                );CHKERRCUSPARSE(stat);
        err  = WaitForCUDA();CHKERRCUDA(err);
        ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
      }
    } else {
      /* Nonzero pattern changed (or non-CSR format): rebuild all GPU structures */
      PetscInt nnz;
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      try {
        /* compressed-row storage only uploads the rows with nonzeros */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR row data");
        if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR column data");
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; } /* pattern only, no values yet: do not mark OFFLOAD_BOTH */
        else nnz = a->nz;

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident alpha/beta scalars, used with CUSPARSE_POINTER_MODE_DEVICE below */
        err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                     mat->num_rows, mat->num_cols, mat->num_entries,
                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                     mat->values->data().get(),
                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* build a temporary CSR image, convert it to HYB/ELL, then discard the CSR */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
                                  matstruct->descr, mat->values->data().get(),
                                  mat->row_offsets->data().get(),
                                  mat->column_indices->data().get(),
                                  hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

/* Thrust functor: elementwise accumulate, second tuple element += first. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/* Thrust functor: elementwise copy, second tuple element = first. */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/* Thrust functor: elementwise copy in the reverse direction, first tuple element = second. */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};

/* Per-product scratch data stored in C->product->data for AIJCUSPARSE mat-mat products. */
struct MatMatCusparse {
  PetscBool             cisdense;      /* true if the user's C was MATSEQDENSE (CPU): convert back after the GPU product */
  PetscScalar           *Bt;           /* explicit B^T buffer (pre-CUDA-11 csrmm has no transpose-on-B) */
  Mat                   X;             /* intermediate dense result for RARt/PtAP */
  PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;
  CsrMatrix             *Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr;
  PetscBool             initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;
  cusparseDnMatDescr_t  matCDescr;
  PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
  size_t                mmBufferSize;
  void                  *mmBuffer;
  void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};

/*
  MatDestroy_MatMatCusparse - destroy callback for MatMatCusparse product data:
  frees the GPU buffers and cusparse descriptors it owns, destroys the
  intermediate matrix X, then frees the struct itself.
  (cudaFree(NULL) is a no-op, so Bt needs no guard.)
*/
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  PetscErrorCode ierr;
  MatMatCusparse *mmdata = (MatMatCusparse *)data;
  cudaError_t    cerr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
#endif

  PetscFunctionBegin;
  cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
  delete mmdata->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->mmBuffer)    { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); }
  if (mmdata->mmBuffer2)   { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); }
  if (mmdata->matBDescr)   { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->matCDescr)   { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->spgemmDesc)  { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); }
#endif
  ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);

/*
  MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA - numeric phase of sparse(A) x dense(B)
  products (AB, AtB, ABt, and the RARt/PtAP variants, which first compute an
  intermediate X = op(A)*op(B) and then finish with a dense-dense multiply).
  Uses cusparseSpMM on CUDA >= 11 and cusparseXcsrmm (with an explicit B^T via
  cublasXgeam when needed) on older toolkits.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* pick op(A) and the output dimensions m x n for each product type */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!cusp->transgen) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      /* use the explicitly generated transpose instead of op(A)=transpose */
      ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
      mat = cusp->matTranspose;
      opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* RARt/PtAP write into the intermediate X; the other products write directly into C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
    /* grow (never shrink) the workspace */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
      cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
#else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* form B^T explicitly with a cuBLAS out-of-place transpose */
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
#endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  /* finish RARt/PtAP with the dense-dense multiply B*X (transposing B for PtAP) */
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
  MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA - symbolic phase: sets the
  sizes and type of C, allocates the MatMatCusparse product data (including the
  B^T buffer pre-CUDA-11 and the intermediate X for RARt/PtAP), and installs
  the numeric callback.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for
MAT_CUSPARSE_CSR format"); 2130 switch (product->type) { 2131 case MATPRODUCT_AB: 2132 m = A->rmap->n; 2133 n = B->cmap->n; 2134 break; 2135 case MATPRODUCT_AtB: 2136 m = A->cmap->n; 2137 n = B->cmap->n; 2138 break; 2139 case MATPRODUCT_ABt: 2140 m = A->rmap->n; 2141 n = B->rmap->n; 2142 break; 2143 case MATPRODUCT_PtAP: 2144 m = B->cmap->n; 2145 n = B->cmap->n; 2146 break; 2147 case MATPRODUCT_RARt: 2148 m = B->rmap->n; 2149 n = B->rmap->n; 2150 break; 2151 default: 2152 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 2153 } 2154 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2155 /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */ 2156 ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr); 2157 ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr); 2158 2159 /* product data */ 2160 ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2161 mmdata->cisdense = cisdense; 2162 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0) 2163 /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */ 2164 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2165 cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr); 2166 } 2167 #endif 2168 /* for these products we need intermediate storage */ 2169 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2170 ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr); 2171 ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr); 2172 if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */ 2173 ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr); 2174 } else { 2175 ierr = 
MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr); 2176 } 2177 } 2178 C->product->data = mmdata; 2179 C->product->destroy = MatDestroy_MatMatCusparse; 2180 2181 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 2182 PetscFunctionReturn(0); 2183 } 2184 2185 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2186 { 2187 Mat_Product *product = C->product; 2188 Mat A,B; 2189 Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2190 Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 2191 Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2192 CsrMatrix *Acsr,*Bcsr,*Ccsr; 2193 PetscBool flg; 2194 PetscErrorCode ierr; 2195 cusparseStatus_t stat; 2196 cudaError_t cerr; 2197 MatProductType ptype; 2198 MatMatCusparse *mmdata; 2199 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2200 cusparseSpMatDescr_t BmatSpDescr; 2201 #endif 2202 2203 PetscFunctionBegin; 2204 MatCheckProduct(C,1); 2205 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 2206 ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2207 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for C of type %s",((PetscObject)C)->type_name); 2208 mmdata = (MatMatCusparse*)C->product->data; 2209 A = product->A; 2210 B = product->B; 2211 if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */ 2212 mmdata->reusesym = PETSC_FALSE; 2213 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2214 if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2215 Cmat = Ccusp->mat; 2216 if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]); 2217 Ccsr = (CsrMatrix*)Cmat->mat; 2218 if (!Ccsr) 
SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
    goto finalize; /* values already computed on the GPU in the symbolic phase; only redo the assembly bookkeeping */
  }
  if (!c->nz) goto finalize; /* empty product: nothing to compute */
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);

  /* for a symmetric operand the transposed product equals the plain one,
     so fall back to AB and avoid touching the transpose struct */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
  /* pick the (possibly transposed) mult structs matching the product type;
     the transposes were generated in the symbolic phase */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
  if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  /* numeric SpGEMM: reuses the spgemm descriptor and work buffer set up in the
     symbolic phase, then copies the result into C's CSR arrays */
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
  ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}

/* Symbolic phase for sparse C = op(A)*op(B) with both operands SEQAIJCUSPARSE:
   sets up C (sizes, type, compressed row layout inherited from A), precomputes
   the flop count, and runs the cusparse SpGEMM/csrgemm machinery to obtain C's
   nonzero pattern. The numerical values end up computed here too (the old
   interface cannot do symbolic-only); see the reusesym flag at the end. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed,ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
  size_t                       bufSize2;
  cusparseSpMatDescr_t         BmatSpDescr;
#else
  int                          cnz;
#endif

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  C->product->data    = mmdata;
  C->product->destroy =
MatDestroy_MatMatCusparse;

  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
  /* for a symmetric operand the transposed product equals the plain one */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  /* result sizes, operand mult structs (building transposes on demand), and
     whether B (as used in the product) and C are in compressed row storage */
  switch (ptype) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    k = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE; /* C uses A's compressed row pattern (copied below) */
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr);
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  ierr  = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  ierr  = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  c     = (Mat_SeqAIJ*)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr  = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, then c will be in compressed row format */
    c->compressedrow.nrows = a->compressedrow.nrows;
    ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
    ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat        = Cmat;
  Ccusp->mat->mat   = Ccsr;
  Ccsr->num_rows    = Ccusp->nrows;
  Ccsr->num_cols    = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
  stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
  /* device-resident scalar constants, used by cusparse calls issued in device pointer mode */
  cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raises errors in different calls when matrices have zero rows/columns! */
    /* empty product: build a valid all-zero CSR for C and skip the cusparse calls */
    thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
    c->nz = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix*)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
    Bcsr = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) { /* upload the full (uncompressed) row offsets once and cache them on B */
      Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
      ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr = Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
                               Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                               Bcsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
  /* precompute flops count (2 flops per scalar multiply-add of the row-by-row product) */
  if (ptype == MATPRODUCT_AB) {
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i+1];
      for (j=st; j<en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2.*(b->i[brow+1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt anzi = a->i[i+1] - a->i[i];
      const PetscInt bnzi = b->i[i+1] - b->i[i];
      flops += (2.*anzi)*bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* C starts as an empty (0 nnz) descriptor; the SpGEMM calls below fill in its size */
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
                           NULL, NULL, NULL,
                           CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                           CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype,
CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
  c->nz = (PetscInt) C_nnz1;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  /* hook C's freshly allocated index/value arrays into its descriptor, then copy the SpGEMM result into them */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
  /* symbolic pass: count C's nonzeros into cnz (host pointer mode so cnz can live on the host) */
  stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
  c->nz = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalizesym:
  /* mirror the GPU pattern in the host Mat_SeqAIJ struct (i/j only; c->a is allocated below but not filled here) */
  c->singlemalloc = PETSC_FALSE;
  c->free_a       = PETSC_TRUE;
  c->free_ij      = PETSC_TRUE;
  ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
    PetscInt *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii = *Ccsr->row_offsets;
    jj = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i; /* compressed offsets land in compressedrow.i; c->i is expanded below */
    cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  if (ciscompressed) { /* need to expand host row offsets */
    PetscInt r = 0;
    c->i[0] = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old  = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r+1] = old;
    }
    /* rows past the last nonzero row all share the final offset */
    for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
  c->maxnz         = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax          = 0;
  /* per-row lengths and row statistics, as MatAssemblyEnd_SeqAIJ would compute them */
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k+1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt)!!nn;
    c->rmax = PetscMax(c->rmax,nn);
  }
  ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated  = PETSC_TRUE;
  C->assembled     = PETSC_FALSE;
  C->was_assembled = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/* handles sparse or dense B */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product *product = mat->product;
PetscErrorCode ierr;
  PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr);
  /* only consider the GPU kernels when no operand has been bound to the CPU */
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
  }
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr);
    }
  }
  if (isdense) { /* sparse * dense */
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) {
        ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) { /* sparse * sparse on the GPU */
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      /* no dedicated sparse-sparse kernel for these: compose them from the basic products */
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* yy = A xx (see MatMultAddKernel_SeqAIJCUSPARSE: trans=FALSE, herm=FALSE) */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* zz = A xx + yy */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* yy = A^H xx (trans=TRUE, herm=TRUE) */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* zz = A^H xx + yy */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* yy = A^T xx (trans=TRUE, herm=FALSE) */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* z = op(A) x + y.
If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 2731 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm) 2732 { 2733 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2734 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 2735 Mat_SeqAIJCUSPARSEMultStruct *matstruct; 2736 PetscScalar *xarray,*zarray,*dptr,*beta,*xptr; 2737 PetscErrorCode ierr; 2738 cudaError_t cerr; 2739 cusparseStatus_t stat; 2740 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2741 PetscBool compressed; 2742 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2743 PetscInt nx,ny; 2744 #endif 2745 2746 PetscFunctionBegin; 2747 if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported"); 2748 if (!a->nonzerorowcnt) { 2749 if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);} 2750 else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);} 2751 PetscFunctionReturn(0); 2752 } 2753 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 2754 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2755 if (!trans) { 2756 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 2757 if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 2758 } else { 2759 if (herm || !cusparsestruct->transgen) { 2760 opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 2761 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 2762 } else { 2763 if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);} 2764 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 2765 } 2766 } 2767 /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 2768 compressed = matstruct->cprowIndices ? 
PETSC_TRUE : PETSC_FALSE; 2769 2770 try { 2771 ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 2772 if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */ 2773 else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */ 2774 2775 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2776 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 2777 /* z = A x + beta y. 2778 If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 2779 When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 2780 */ 2781 xptr = xarray; 2782 dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 2783 beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 2784 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2785 /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 2786 allocated to accommodate different uses. So we get the length info directly from mat. 2787 */ 2788 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2789 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2790 nx = mat->num_cols; 2791 ny = mat->num_rows; 2792 } 2793 #endif 2794 } else { 2795 /* z = A^T x + beta y 2796 If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 2797 Note A^Tx is of full length, so we set beta to 1.0 if y exists. 2798 */ 2799 xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 2800 dptr = zarray; 2801 beta = yy ? 
matstruct->beta_one : matstruct->beta_zero; 2802 if (compressed) { /* Scatter x to work vector */ 2803 thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 2804 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 2805 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 2806 VecCUDAEqualsReverse()); 2807 } 2808 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2809 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2810 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2811 nx = mat->num_rows; 2812 ny = mat->num_cols; 2813 } 2814 #endif 2815 } 2816 2817 /* csr_spmv does y = alpha op(A) x + beta y */ 2818 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2819 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2820 if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 2821 if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 2822 stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 2823 stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 2824 stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, 2825 matstruct->matDescr, 2826 matstruct->cuSpMV[opA].vecXDescr, beta, 2827 matstruct->cuSpMV[opA].vecYDescr, 2828 cusparse_scalartype, 2829 cusparsestruct->spmvAlg, 2830 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat); 2831 cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr); 2832 2833 matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 2834 } else { 2835 /* x, y's value pointers might change between 
calls, but their shape is kept, so we just update pointers */ 2836 stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat); 2837 stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat); 2838 } 2839 2840 stat = cusparseSpMV(cusparsestruct->handle, opA, 2841 matstruct->alpha_one, 2842 matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */ 2843 matstruct->cuSpMV[opA].vecXDescr, 2844 beta, 2845 matstruct->cuSpMV[opA].vecYDescr, 2846 cusparse_scalartype, 2847 cusparsestruct->spmvAlg, 2848 matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat); 2849 #else 2850 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2851 stat = cusparse_csr_spmv(cusparsestruct->handle, opA, 2852 mat->num_rows, mat->num_cols, 2853 mat->num_entries, matstruct->alpha_one, matstruct->descr, 2854 mat->values->data().get(), mat->row_offsets->data().get(), 2855 mat->column_indices->data().get(), xptr, beta, 2856 dptr);CHKERRCUSPARSE(stat); 2857 #endif 2858 } else { 2859 if (cusparsestruct->nrows) { 2860 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2861 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 2862 #else 2863 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 2864 stat = cusparse_hyb_spmv(cusparsestruct->handle, opA, 2865 matstruct->alpha_one, matstruct->descr, hybMat, 2866 xptr, beta, 2867 dptr);CHKERRCUSPARSE(stat); 2868 #endif 2869 } 2870 } 2871 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2872 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2873 2874 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 2875 if (yy) { /* MatMultAdd: zz = A*xx + yy */ 2876 if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 2877 ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */ 2878 } else if (zz != yy) { /* A is not compressed. 
zz already contains A*xx, and we just need to add yy */ 2879 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 2880 } 2881 } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 2882 ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr); 2883 } 2884 2885 /* ScatterAdd the result from work vector into the full vector when A is compressed */ 2886 if (compressed) { 2887 thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 2888 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2889 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 2890 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 2891 VecCUDAPlusEquals()); 2892 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2893 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2894 } 2895 } else { 2896 if (yy && yy != zz) { 2897 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 2898 } 2899 } 2900 ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 2901 if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);} 2902 else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);} 2903 } catch(char *ex) { 2904 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 2905 } 2906 if (yy) { 2907 ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); 2908 } else { 2909 ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr); 2910 } 2911 PetscFunctionReturn(0); 2912 } 2913 2914 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2915 { 2916 PetscErrorCode ierr; 2917 2918 PetscFunctionBegin; 2919 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2920 
PetscFunctionReturn(0); 2921 } 2922 2923 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 2924 { 2925 PetscErrorCode ierr; 2926 PetscSplitCSRDataStructure *d_mat = NULL; 2927 PetscFunctionBegin; 2928 if (A->factortype == MAT_FACTOR_NONE) { 2929 d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat; 2930 } 2931 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it? 2932 if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0); 2933 if (d_mat) { 2934 A->offloadmask = PETSC_OFFLOAD_GPU; 2935 } 2936 2937 PetscFunctionReturn(0); 2938 } 2939 2940 /* --------------------------------------------------------------------------------*/ 2941 /*@ 2942 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 2943 (the default parallel PETSc format). This matrix will ultimately pushed down 2944 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 2945 assembly performance the user should preallocate the matrix storage by setting 2946 the parameter nz (or the array nnz). By setting these parameters accurately, 2947 performance during matrix assembly can be increased by more than a factor of 50. 2948 2949 Collective 2950 2951 Input Parameters: 2952 + comm - MPI communicator, set to PETSC_COMM_SELF 2953 . m - number of rows 2954 . n - number of columns 2955 . nz - number of nonzeros per row (same for all rows) 2956 - nnz - array containing the number of nonzeros in the various rows 2957 (possibly different for each row) or NULL 2958 2959 Output Parameter: 2960 . A - the matrix 2961 2962 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 2963 MatXXXXSetPreallocation() paradgm instead of this routine directly. 
2964 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 2965 2966 Notes: 2967 If nnz is given then nz is ignored 2968 2969 The AIJ format (also called the Yale sparse matrix format or 2970 compressed row storage), is fully compatible with standard Fortran 77 2971 storage. That is, the stored row and column indices can begin at 2972 either one (as in Fortran) or zero. See the users' manual for details. 2973 2974 Specify the preallocated storage with either nz or nnz (not both). 2975 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 2976 allocation. For large problems you MUST preallocate memory or you 2977 will get TERRIBLE performance, see the users' manual chapter on matrices. 2978 2979 By default, this format uses inodes (identical nodes) when possible, to 2980 improve numerical efficiency of matrix-vector products and solves. We 2981 search for consecutive rows with the same nonzero structure, thereby 2982 reusing matrix information to achieve increased efficiency. 
2983 2984 Level: intermediate 2985 2986 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 2987 @*/ 2988 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 2989 { 2990 PetscErrorCode ierr; 2991 2992 PetscFunctionBegin; 2993 ierr = MatCreate(comm,A);CHKERRQ(ierr); 2994 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 2995 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 2996 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 2997 PetscFunctionReturn(0); 2998 } 2999 3000 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 3001 { 3002 PetscErrorCode ierr; 3003 PetscSplitCSRDataStructure *d_mat = NULL; 3004 3005 PetscFunctionBegin; 3006 if (A->factortype == MAT_FACTOR_NONE) { 3007 d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat; 3008 ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL; 3009 ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr); 3010 } else { 3011 ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr); 3012 } 3013 if (d_mat) { 3014 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3015 cudaError_t err; 3016 PetscSplitCSRDataStructure h_mat; 3017 ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr); 3018 err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err); 3019 if (a->compressedrow.use) { 3020 err = cudaFree(h_mat.diag.i);CHKERRCUDA(err); 3021 } 3022 err = cudaFree(d_mat);CHKERRCUDA(err); 3023 } 3024 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr); 3025 ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr); 3026 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 3027 
ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 3028 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr); 3029 ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 3030 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 3031 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 3032 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 3033 PetscFunctionReturn(0); 3034 } 3035 3036 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*); 3037 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool); 3038 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B) 3039 { 3040 PetscErrorCode ierr; 3041 3042 PetscFunctionBegin; 3043 ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 3044 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr); 3045 PetscFunctionReturn(0); 3046 } 3047 3048 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str) 3049 { 3050 PetscErrorCode ierr; 3051 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 3052 Mat_SeqAIJCUSPARSE *cy; 3053 Mat_SeqAIJCUSPARSE *cx; 3054 PetscScalar *ay; 3055 const PetscScalar *ax; 3056 CsrMatrix *csry,*csrx; 3057 cudaError_t cerr; 3058 3059 PetscFunctionBegin; 3060 if (X->ops->axpy != Y->ops->axpy) { 3061 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 3062 PetscFunctionReturn(0); 3063 } 3064 /* if we are here, it means both matrices are bound to GPU */ 3065 ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr); 3066 ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr); 3067 cy = (Mat_SeqAIJCUSPARSE*)Y->spptr; 3068 cx = (Mat_SeqAIJCUSPARSE*)X->spptr; 3069 if (cy->format != MAT_CUSPARSE_CSR) 
SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported"); 3070 if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported"); 3071 csry = (CsrMatrix*)cy->mat->mat; 3072 csrx = (CsrMatrix*)cx->mat->mat; 3073 /* see if we can turn this into a cublas axpy */ 3074 if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) { 3075 bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin()); 3076 if (eq) { 3077 eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin()); 3078 } 3079 if (eq) str = SAME_NONZERO_PATTERN; 3080 } 3081 3082 if (str == SUBSET_NONZERO_PATTERN) { 3083 cusparseStatus_t stat; 3084 PetscScalar b = 1.0; 3085 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3086 size_t bufferSize; 3087 void *buffer; 3088 #endif 3089 3090 ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr); 3091 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3092 stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 3093 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3094 stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n, 3095 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3096 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3097 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat); 3098 cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr); 3099 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3100 stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3101 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3102 
&b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3103 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat); 3104 cerr = WaitForCUDA();CHKERRCUDA(cerr); 3105 ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr); 3106 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3107 cerr = cudaFree(buffer);CHKERRCUDA(cerr); 3108 #else 3109 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3110 stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3111 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3112 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3113 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat); 3114 cerr = WaitForCUDA();CHKERRCUDA(cerr); 3115 ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr); 3116 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3117 #endif 3118 stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 3119 ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr); 3120 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3121 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3122 } else if (str == SAME_NONZERO_PATTERN) { 3123 cublasHandle_t cublasv2handle; 3124 cublasStatus_t berr; 3125 PetscBLASInt one = 1, bnz = 1; 3126 3127 ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr); 3128 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3129 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 3130 ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr); 3131 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3132 berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr); 3133 cerr = WaitForCUDA();CHKERRCUDA(cerr); 3134 ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr); 3135 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3136 ierr = 
MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr); 3137 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3138 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3139 } else { 3140 ierr = MatAXPY_SeqAIJ(Y,a,X,DIFFERENT_NONZERO_PATTERN);CHKERRQ(ierr); 3141 } 3142 PetscFunctionReturn(0); 3143 } 3144 3145 static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) 3146 { 3147 PetscErrorCode ierr; 3148 PetscBool both = PETSC_FALSE; 3149 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3150 3151 PetscFunctionBegin; 3152 if (A->factortype == MAT_FACTOR_NONE) { 3153 Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr; 3154 if (spptr->mat) { 3155 CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat; 3156 if (matrix->values) { 3157 both = PETSC_TRUE; 3158 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3159 } 3160 } 3161 if (spptr->matTranspose) { 3162 CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat; 3163 if (matrix->values) { 3164 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3165 } 3166 } 3167 } 3168 //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 3169 ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr); 3170 ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 3171 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 3172 else A->offloadmask = PETSC_OFFLOAD_CPU; 3173 3174 PetscFunctionReturn(0); 3175 } 3176 3177 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg) 3178 { 3179 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3180 PetscErrorCode ierr; 3181 3182 PetscFunctionBegin; 3183 if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0); 3184 if (flg) { 3185 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 3186 3187 A->ops->axpy = MatAXPY_SeqAIJ; 3188 A->ops->zeroentries = MatZeroEntries_SeqAIJ; 3189 A->ops->mult = MatMult_SeqAIJ; 3190 A->ops->multadd = MatMultAdd_SeqAIJ; 3191 A->ops->multtranspose = MatMultTranspose_SeqAIJ; 3192 A->ops->multtransposeadd = 
MatMultTransposeAdd_SeqAIJ; 3193 A->ops->multhermitiantranspose = NULL; 3194 A->ops->multhermitiantransposeadd = NULL; 3195 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ; 3196 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr); 3197 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 3198 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 3199 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 3200 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 3201 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr); 3202 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr); 3203 } else { 3204 A->ops->axpy = MatAXPY_SeqAIJCUSPARSE; 3205 A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE; 3206 A->ops->mult = MatMult_SeqAIJCUSPARSE; 3207 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 3208 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 3209 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 3210 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 3211 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 3212 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE; 3213 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr); 3214 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3215 ierr = 
PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3216 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3217 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3218 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr); 3219 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3220 } 3221 A->boundtocpu = flg; 3222 a->inode.use = flg; 3223 PetscFunctionReturn(0); 3224 } 3225 3226 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 3227 { 3228 PetscErrorCode ierr; 3229 cusparseStatus_t stat; 3230 Mat B; 3231 3232 PetscFunctionBegin; 3233 ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */ 3234 if (reuse == MAT_INITIAL_MATRIX) { 3235 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 3236 } else if (reuse == MAT_REUSE_MATRIX) { 3237 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 3238 } 3239 B = *newmat; 3240 3241 ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 3242 ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr); 3243 3244 if (reuse != MAT_REUSE_MATRIX && !B->spptr) { 3245 if (B->factortype == MAT_FACTOR_NONE) { 3246 Mat_SeqAIJCUSPARSE *spptr; 3247 3248 ierr = PetscNew(&spptr);CHKERRQ(ierr); 3249 spptr->format = MAT_CUSPARSE_CSR; 3250 stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3251 B->spptr = spptr; 3252 spptr->deviceMat = NULL; 3253 } else { 3254 Mat_SeqAIJCUSPARSETriFactors *spptr; 3255 3256 ierr = PetscNew(&spptr);CHKERRQ(ierr); 3257 stat = 
cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3258 B->spptr = spptr; 3259 } 3260 B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 3261 } 3262 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 3263 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 3264 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 3265 B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 3266 B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 3267 3268 ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr); 3269 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3270 ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 3271 PetscFunctionReturn(0); 3272 } 3273 3274 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 3275 { 3276 PetscErrorCode ierr; 3277 3278 PetscFunctionBegin; 3279 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 3280 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 3281 ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr); 3282 ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr); 3283 ierr = PetscOptionsEnd();CHKERRQ(ierr); 3284 PetscFunctionReturn(0); 3285 } 3286 3287 /*MC 3288 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 3289 3290 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 3291 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 3292 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 3293 3294 Options Database Keys: 3295 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 3296 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 
3297 - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3298 3299 Level: beginner 3300 3301 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 3302 M*/ 3303 3304 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*); 3305 3306 3307 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 3308 { 3309 PetscErrorCode ierr; 3310 3311 PetscFunctionBegin; 3312 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3313 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3314 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3315 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3316 PetscFunctionReturn(0); 3317 } 3318 3319 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 3320 { 3321 PetscErrorCode ierr; 3322 cusparseStatus_t stat; 3323 3324 PetscFunctionBegin; 3325 if (*cusparsestruct) { 3326 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr); 3327 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr); 3328 delete (*cusparsestruct)->workVector; 3329 delete (*cusparsestruct)->rowoffsets_gpu; 3330 delete (*cusparsestruct)->cooPerm; 3331 delete (*cusparsestruct)->cooPerm_a; 3332 if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);} 3333 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 
3334 cudaError_t cerr = cudaFree((*cusparsestruct)->csr2cscBuffer);CHKERRCUDA(cerr); 3335 #endif 3336 ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr); 3337 } 3338 PetscFunctionReturn(0); 3339 } 3340 3341 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 3342 { 3343 PetscFunctionBegin; 3344 if (*mat) { 3345 delete (*mat)->values; 3346 delete (*mat)->column_indices; 3347 delete (*mat)->row_offsets; 3348 delete *mat; 3349 *mat = 0; 3350 } 3351 PetscFunctionReturn(0); 3352 } 3353 3354 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 3355 { 3356 cusparseStatus_t stat; 3357 PetscErrorCode ierr; 3358 3359 PetscFunctionBegin; 3360 if (*trifactor) { 3361 if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); } 3362 if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); } 3363 ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 3364 if ((*trifactor)->solveBuffer) {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);} 3365 if ((*trifactor)->AA_h) {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);} 3366 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3367 if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);} 3368 #endif 3369 ierr = PetscFree(*trifactor);CHKERRQ(ierr); 3370 } 3371 PetscFunctionReturn(0); 3372 } 3373 3374 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 3375 { 3376 CsrMatrix *mat; 3377 cusparseStatus_t stat; 3378 cudaError_t err; 3379 3380 PetscFunctionBegin; 3381 if (*matstruct) { 3382 if ((*matstruct)->mat) { 3383 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 3384 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3385 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported 
since CUDA-11.0"); 3386 #else 3387 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 3388 stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat); 3389 #endif 3390 } else { 3391 mat = (CsrMatrix*)(*matstruct)->mat; 3392 CsrMatrix_Destroy(&mat); 3393 } 3394 } 3395 if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); } 3396 delete (*matstruct)->cprowIndices; 3397 if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); } 3398 if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); } 3399 if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); } 3400 3401 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3402 Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 3403 if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);} 3404 for (int i=0; i<3; i++) { 3405 if (mdata->cuSpMV[i].initialized) { 3406 err = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err); 3407 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat); 3408 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat); 3409 } 3410 } 3411 #endif 3412 delete *matstruct; 3413 *matstruct = NULL; 3414 } 3415 PetscFunctionReturn(0); 3416 } 3417 3418 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors) 3419 { 3420 PetscErrorCode ierr; 3421 3422 PetscFunctionBegin; 3423 if (*trifactors) { 3424 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr); 3425 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr); 3426 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr); 3427 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr); 3428 delete (*trifactors)->rpermIndices; 3429 delete (*trifactors)->cpermIndices; 3430 
delete (*trifactors)->workVector; 3431 (*trifactors)->rpermIndices = NULL; 3432 (*trifactors)->cpermIndices = NULL; 3433 (*trifactors)->workVector = NULL; 3434 } 3435 PetscFunctionReturn(0); 3436 } 3437 3438 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 3439 { 3440 PetscErrorCode ierr; 3441 cusparseHandle_t handle; 3442 cusparseStatus_t stat; 3443 3444 PetscFunctionBegin; 3445 if (*trifactors) { 3446 ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr); 3447 if (handle = (*trifactors)->handle) { 3448 stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 3449 } 3450 ierr = PetscFree(*trifactors);CHKERRQ(ierr); 3451 } 3452 PetscFunctionReturn(0); 3453 } 3454 3455 struct IJCompare 3456 { 3457 __host__ __device__ 3458 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3459 { 3460 if (t1.get<0>() < t2.get<0>()) return true; 3461 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 3462 return false; 3463 } 3464 }; 3465 3466 struct IJEqual 3467 { 3468 __host__ __device__ 3469 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3470 { 3471 if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false; 3472 return true; 3473 } 3474 }; 3475 3476 struct IJDiff 3477 { 3478 __host__ __device__ 3479 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 3480 { 3481 return t1 == t2 ? 
0 : 1;
  }
};

/* logical OR on two 0/1 flags; combines the row-difference and column-difference flags */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return t1||t2;
  }
};

#include <thrust/iterator/discard_iterator.h>
/* Insert/add a COO-ordered array of values v into the GPU CSR value array, using the
   permutation (cooPerm) and duplicate map (cooPerm_a) built by
   MatSetPreallocationCOO_SeqAIJCUSPARSE.
   - v == NULL with INSERT_VALUES zeros the CSR values; with ADD_VALUES it is a no-op.
   - v may be host or device memory (detected via isCudaMem).
   Leaves the matrix assembled with offloadmask = PETSC_OFFLOAD_GPU. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *cooPerm_v = NULL; /* device copy of v when v is host memory */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *matrix;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;
  PetscInt                              n;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO preallocation information: fall back to a plain assembly */
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) {
      /* duplicates present: first reduce the permuted values per CSR slot, then add */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals());
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    }
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
  ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr);
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}

#include <thrust/binary_search.h>
/* Build the CSR structure (a->i, a->j) on the GPU from n COO (i,j) pairs, recording in
   cusp->cooPerm the permutation that maps user COO order to CSR order, and (when the
   COO list contains duplicates) in cusp->cooPerm_a the CSR slot index of each sorted
   COO entry so MatSetValuesCOO can reduce duplicates. */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0;
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) { /* size changed: discard previously built permutation data */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);

    if (!cusp->cooPerm) { cusp->cooPerm = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* sort the (i,j) pairs lexicographically, dragging along an identity sequence so
       cooPerm records where each original COO entry landed */
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());
    *cusp->cooPerm_a = d_i;
    THRUSTINTARRAY w = d_j;

    auto nekey = thrust::unique(fkey, ekey, IJEqual());
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
cusp->cooPerm_a = NULL;
    } else { /* duplicates present; I couldn't come up with a more elegant algorithm */
      /* mark positions where either the row or the column changes w.r.t. the previous
         sorted entry, then an inclusive scan turns those 0/1 flags into the CSR slot
         index of every sorted COO entry (duplicates share a slot) */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());
      (*cusp->cooPerm_a)[0] = 0;
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
    }
    /* ii[r] = number of unique entries with row index <= r, i.e. the CSR row offset of
       row r+1 (d_i is sorted and unique() left the unique keys at its front) */
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(),
                        search_begin, search_begin + A->rmap->n,
                        ii.begin());
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* rebuild the host CSR arrays (a->i, a->j) from the device results */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0;
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  /* any cached transpose is invalid for the new nonzero pattern */
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);

  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}

/* Return a read-only device pointer to the CSR values of A (copying host data to the
   GPU first if needed); pair with MatSeqAIJCUSPARSERestoreArrayRead */
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  PetscFunctionReturn(0);
}

/* Invalidate the device pointer obtained with MatSeqAIJCUSPARSEGetArrayRead;
   read-only access, so the object state is not increased */
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
*a = NULL;
  PetscFunctionReturn(0);
}

/* Return a read-write device pointer to the CSR values of A (copying host data to the
   GPU first if needed); marks the GPU copy as the valid one */
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}

/* Invalidate the device pointer obtained with MatSeqAIJCUSPARSEGetArray; the values
   may have been changed, so the object state is increased */
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL;
  PetscFunctionReturn(0);
}

/* Return a write-only device pointer to the CSR values of A; unlike the Get/GetRead
   variants this does NOT copy host data to the GPU first, since the caller promises
   to overwrite the values */
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}

/* Invalidate the device pointer obtained with MatSeqAIJCUSPARSEGetArrayWrite */
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL;
  PetscFunctionReturn(0);
}

/* Lexicographic (row,col) ordering on merged COO tuples (row, col, value, source tag);
   value and tag do not take part in the comparison */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    if (t1.get<0>() < t2.get<0>()) return true;
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
    return false;
  }
};

/* Add a constant shift to an index (used to offset B's column indices by A->cmap->n) */
struct Shift
{
  int _shift;

  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &c)
  {
    return c + _shift;
  }
};

/* merges two SeqAIJCUSPARSE matrices, [A';B']' operation in matlab notation */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     Annz,Bnnz;
  cusparseStatus_t             stat;
  PetscInt                     i,m,n,zero = 0;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != 
%D",A->rmap->n,B->rmap->n);
  if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
  if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* build C = [A B] from scratch: allocate the Mat, its cusparse struct and CSR storage */
    m     = A->rmap->n;
    n     = A->cmap->n + B->cmap->n;
    ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
    ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
    ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    c     = (Mat_SeqAIJ*)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
    Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr  = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows    = m;
    Ccusp->mat      = Cmat;
    Ccusp->mat->mat = Ccsr;
    Ccsr->num_rows  = m;
    Ccsr->num_cols  = n;
    stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr);
    if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr = (CsrMatrix*)Acusp->mat->mat;
    Bcsr = (CsrMatrix*)Bcusp->mat->mat;
    Annz = (PetscInt)Acsr->column_indices->size();
    Bnnz = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets    = new THRUSTINTARRAY32(m+1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    Ccsr->num_entries    = c->nz;
    Ccusp->cooPerm       = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      auto Acoo = new THRUSTINTARRAY32(Annz);
      auto Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff,*Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
          ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
          ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      /* expand the CSR row offsets of A and B into explicit COO row indices so the two
         matrices can be merged as sorted (row,col) streams */
      stat = cusparseXcsr2coo(Acusp->handle,
                              Aroff->data().get(),
                              Annz,
                              m,
                              Acoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle,
                              Broff->data().get(),
                              Bnnz,
                              m,
                              Bcoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      auto Aperm = thrust::make_constant_iterator(1); /* tags each merged entry with its source: 1 = A, 0 = B */
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
      /* B's columns are shifted by A->cmap->n so they land to the right of A in C */
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
      auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
      auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
      auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
      auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
      auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
      auto p1 = Ccusp->cooPerm->begin();
      auto p2 = Ccusp->cooPerm->begin();
      thrust::advance(p2,Annz);
      PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
      thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n)); /* undo the in-place shift of B's columns */
#endif
      /* split the source tags in wPerm into cooPerm: positions of A's entries in C
         first (p1), then positions of B's entries (p2) */
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      auto pred = thrust::identity<int>();
      PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
      PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
#endif
      /* compress the merged COO row indices back into CSR row offsets for C */
      stat = cusparseXcoo2csr(Ccusp->handle,
                              Ccoo->data().get(),
                              c->nz,
                              m,
                              Ccsr->row_offsets->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
                               Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
      if (Acusp->transgen && Bcusp->transgen) { /* if A and B have the transpose, generate C transpose too */
        /* C' = [A' ; B'] stacked: A's transpose rows first, then B's with offsets shifted by a->nz */
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix *CcsrT = new CsrMatrix;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

        Ccusp->transgen = PETSC_TRUE;
        CmatT->cprowIndices = NULL;
        CmatT->mat = CcsrT;
        CcsrT->num_rows = n;
        CcsrT->num_cols = m;
        CcsrT->num_entries = c->nz;

        CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values = new THRUSTARRAY(c->nz);

        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
          thrust::advance(rT,-1); /* B's first row offset overwrites A's last (both equal a->nz) */
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
          thrust::copy(titb,tite,rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

        stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
                                 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
                                 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror the CSR structure on the host so C behaves like a regular SeqAIJ matrix */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii = *Ccsr->row_offsets;
      jj = *Ccsr->column_indices;
      cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    } else {
      cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
    c->maxnz         = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax          = 0;
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i+1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax,nn);
    }
    ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
    (*C)->nonzerostate++;
    ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
    ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* MAT_REUSE_MATRIX: only the numerical values change; scatter A's and B's values
       into C using the cooPerm positions recorded at creation */
    if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n);
    c = (Mat_SeqAIJ*)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
      if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
      if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
      if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
      if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix*)Acusp->mat->mat;
      Bcsr = (CsrMatrix*)Bcusp->mat->mat;
      Ccsr = (CsrMatrix*)Ccusp->mat->mat;
      if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
      if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != 
(CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
      }
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    }
  }
  ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}

/* Copy matrix values from the GPU CSR value array of A into v:
   - if idx is given, v[k] = values[idx[k]] for k = 0..n-1 (gathered on the GPU);
   - otherwise the first n values are copied contiguously.
   v may be host or device memory (detected via isCudaMem); when it is host memory the
   result is staged through a device work array (gather case) or copied directly. */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;
  const PetscScalar *av;
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    THRUSTARRAY *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w = new THRUSTARRAY(n); /* device staging buffer for the gathered values */
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) {
      cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    delete w;
  } else {
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  /* when v is host memory the data just moved device->host; log it in that direction
     (was incorrectly PetscLogCpuToGpu) */
  if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); }
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}