/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>

const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
  } cusparseCsr2CscAlg_t;
*/
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif
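/*
   Illustration (a hedged sketch, not taken from the original source): the string tables above are
   what PetscOptionsEnum() matches user input against, and the 0-based position of the matched entry
   is what gets cast to the corresponding cuSPARSE enum in MatSetFromOptions_SeqAIJCUSPARSE() later
   in this file. Assuming a matrix of type seqaijcusparse/aijcusparse, typical command line usage
   would look like:

     -mat_cusparse_storage_format csr
     -mat_cusparse_spmv_alg csrmv_alg1        (CUDA 11 and later only)
     -mat_cusparse_csr2csc_alg alg1           (CUDA 11 and later only)
*/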
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);

PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);

PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}
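/*
   Illustrative sketch (an assumption-laden example, not from the original source): an application
   that already owns a cuSPARSE handle and a CUDA stream can attach them to a matrix A of type
   MATSEQAIJCUSPARSE, so that the cuSPARSE calls issued by PETSc for this matrix run on that stream:

     cusparseHandle_t handle;   // created elsewhere by the application with cusparseCreate()
     cudaStream_t     stream;   // created elsewhere by the application with cudaStreamCreate()
     ierr = MatCUSPARSESetHandle(A,handle);CHKERRQ(ierr);
     ierr = MatCUSPARSESetStream(A,stream);CHKERRQ(ierr);
*/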
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          flg;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg || !cusparsestruct) PetscFunctionReturn(0);
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix solver type providing triangular solvers for sequential matrices
  on a single GPU of type seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
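/*
   Example for illustration (a sketch under assumptions, not taken from the original source):
   selecting this solver for an ILU preconditioner from code, assuming ksp is a KSP whose operator
   is a matrix of type MATSEQAIJCUSPARSE. The command line equivalent is
   -pc_type ilu -pc_factor_mat_solver_type cusparse.

     PC pc;
     ierr = KSPGetPC(ksp,&pc);CHKERRQ(ierr);
     ierr = PCSetType(pc,PCILU);CHKERRQ(ierr);
     ierr = PCFactorSetMatSolverType(pc,MATSOLVERCUSPARSE);CHKERRQ(ierr);
*/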
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  (*B)->useordering = PETSC_TRUE;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.

   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
   MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose

   Collective on mat

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
-  transgen - the boolean flag

   Level: intermediate

.seealso: MATSEQAIJCUSPARSE, MatAIJCUSPARSESetGenerateTranspose()
@*/
PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
{
  PetscErrorCode ierr;
  PetscBool      flg;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (flg) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
    cusp->transgen = transgen;
    if (!transgen) { /* need to destroy the transpose matrix if present, to prevent logic errors if transgen is set to true later */
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscBool transgen = cusparsestruct->transgen;

    ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
"MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 276 if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);} 277 278 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 279 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 280 if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);} 281 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 282 cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 283 ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", 284 "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr); 285 /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */ 286 if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 287 288 cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 289 ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", 290 "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr); 291 if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly"); 292 293 cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 294 ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", 295 "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr); 296 if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly"); 297 #endif 298 } 299 ierr = PetscOptionsTail();CHKERRQ(ierr); 300 PetscFunctionReturn(0); 301 } 302 303 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 304 { 305 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 306 PetscErrorCode ierr; 307 308 PetscFunctionBegin; 309 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 310 ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 311 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 312 PetscFunctionReturn(0); 313 } 314 315 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 316 { 317 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 318 PetscErrorCode ierr; 319 320 PetscFunctionBegin; 321 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 322 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 323 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 324 
PetscFunctionReturn(0); 325 } 326 327 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 328 { 329 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 330 PetscErrorCode ierr; 331 332 PetscFunctionBegin; 333 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 334 ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 335 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 336 PetscFunctionReturn(0); 337 } 338 339 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 340 { 341 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 342 PetscErrorCode ierr; 343 344 PetscFunctionBegin; 345 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 346 ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 347 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 348 PetscFunctionReturn(0); 349 } 350 351 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 352 { 353 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 354 PetscInt n = A->rmap->n; 355 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 356 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 357 cusparseStatus_t stat; 358 const PetscInt *ai = a->i,*aj = a->j,*vi; 359 const MatScalar *aa = a->a,*v; 360 PetscInt *AiLo, *AjLo; 361 PetscInt i,nz, nzLower, offset, rowOffset; 362 PetscErrorCode ierr; 363 cudaError_t cerr; 364 365 PetscFunctionBegin; 366 if (!n) PetscFunctionReturn(0); 367 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 368 try { 369 /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. 
*/ 370 nzLower=n+ai[n]-ai[1]; 371 if (!loTriFactor) { 372 PetscScalar *AALo; 373 374 cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr); 375 376 /* Allocate Space for the lower triangular matrix */ 377 cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 378 cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr); 379 380 /* Fill the lower triangular matrix */ 381 AiLo[0] = (PetscInt) 0; 382 AiLo[n] = nzLower; 383 AjLo[0] = (PetscInt) 0; 384 AALo[0] = (MatScalar) 1.0; 385 v = aa; 386 vi = aj; 387 offset = 1; 388 rowOffset= 1; 389 for (i=1; i<n; i++) { 390 nz = ai[i+1] - ai[i]; 391 /* additional 1 for the term on the diagonal */ 392 AiLo[i] = rowOffset; 393 rowOffset += nz+1; 394 395 ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr); 396 ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr); 397 398 offset += nz; 399 AjLo[offset] = (PetscInt) i; 400 AALo[offset] = (MatScalar) 1.0; 401 offset += 1; 402 403 v += nz; 404 vi += nz; 405 } 406 407 /* allocate space for the triangular factor information */ 408 ierr = PetscNew(&loTriFactor);CHKERRQ(ierr); 409 loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 410 /* Create the matrix description */ 411 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 412 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 413 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 414 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 415 #else 416 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 417 #endif 418 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat); 419 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 420 421 /* set the operation */ 422 loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 423 424 /* set the matrix */ 425 loTriFactor->csrMat = new CsrMatrix; 426 loTriFactor->csrMat->num_rows = n; 427 loTriFactor->csrMat->num_cols = n; 428 loTriFactor->csrMat->num_entries = nzLower; 429 430 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 431 loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1); 432 433 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 434 loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower); 435 436 loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 437 loTriFactor->csrMat->values->assign(AALo, AALo+nzLower); 438 439 /* Create the solve analysis information */ 440 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 441 stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 442 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 443 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 444 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 445 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 446 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 447 &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 448 cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 449 #endif 450 451 /* perform the solve analysis */ 452 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 453 loTriFactor->csrMat->num_rows, 
loTriFactor->csrMat->num_entries, loTriFactor->descr, 454 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 455 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo 456 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 457 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 458 #endif 459 );CHKERRCUSPARSE(stat); 460 cerr = WaitForCUDA();CHKERRCUDA(cerr); 461 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 462 463 /* assign the pointer */ 464 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 465 loTriFactor->AA_h = AALo; 466 cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr); 467 cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr); 468 ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 469 } else { /* update values only */ 470 if (!loTriFactor->AA_h) { 471 cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr); 472 } 473 /* Fill the lower triangular matrix */ 474 loTriFactor->AA_h[0] = 1.0; 475 v = aa; 476 vi = aj; 477 offset = 1; 478 for (i=1; i<n; i++) { 479 nz = ai[i+1] - ai[i]; 480 ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr); 481 offset += nz; 482 loTriFactor->AA_h[offset] = 1.0; 483 offset += 1; 484 v += nz; 485 } 486 loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower); 487 ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 488 } 489 } catch(char *ex) { 490 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 491 } 492 } 493 PetscFunctionReturn(0); 494 } 495 496 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 497 { 498 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 499 PetscInt n = A->rmap->n; 500 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 501 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 502 cusparseStatus_t stat; 503 const PetscInt *aj = a->j,*adiag = a->diag,*vi; 504 const MatScalar *aa = a->a,*v; 505 PetscInt *AiUp, *AjUp; 506 PetscInt i,nz, nzUpper, offset; 507 PetscErrorCode ierr; 508 cudaError_t cerr; 509 510 PetscFunctionBegin; 511 if (!n) PetscFunctionReturn(0); 512 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 513 try { 514 /* next, figure out the number of nonzeros in the upper triangular matrix. 
*/ 515 nzUpper = adiag[0]-adiag[n]; 516 if (!upTriFactor) { 517 PetscScalar *AAUp; 518 519 cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 520 521 /* Allocate Space for the upper triangular matrix */ 522 cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 523 cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 524 525 /* Fill the upper triangular matrix */ 526 AiUp[0]=(PetscInt) 0; 527 AiUp[n]=nzUpper; 528 offset = nzUpper; 529 for (i=n-1; i>=0; i--) { 530 v = aa + adiag[i+1] + 1; 531 vi = aj + adiag[i+1] + 1; 532 533 /* number of elements NOT on the diagonal */ 534 nz = adiag[i] - adiag[i+1]-1; 535 536 /* decrement the offset */ 537 offset -= (nz+1); 538 539 /* first, set the diagonal elements */ 540 AjUp[offset] = (PetscInt) i; 541 AAUp[offset] = (MatScalar)1./v[nz]; 542 AiUp[i] = AiUp[i+1] - (nz+1); 543 544 ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr); 545 ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr); 546 } 547 548 /* allocate space for the triangular factor information */ 549 ierr = PetscNew(&upTriFactor);CHKERRQ(ierr); 550 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 551 552 /* Create the matrix description */ 553 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 554 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 555 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 556 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 557 #else 558 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 559 #endif 560 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 561 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 562 563 /* set the operation */ 564 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 565 566 /* set the matrix */ 567 upTriFactor->csrMat = new CsrMatrix; 568 upTriFactor->csrMat->num_rows = n; 569 upTriFactor->csrMat->num_cols = n; 570 upTriFactor->csrMat->num_entries = nzUpper; 571 572 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 573 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 574 575 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 576 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 577 578 upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 579 upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 580 581 /* Create the solve analysis information */ 582 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 583 stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 584 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 585 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 586 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 587 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 588 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 589 &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 590 cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 591 #endif 592 593 /* perform the solve analysis */ 594 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 595 upTriFactor->csrMat->num_rows, 
upTriFactor->csrMat->num_entries, upTriFactor->descr, 596 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 597 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo 598 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 599 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 600 #endif 601 );CHKERRCUSPARSE(stat); 602 cerr = WaitForCUDA();CHKERRCUDA(cerr); 603 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 604 605 /* assign the pointer */ 606 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 607 upTriFactor->AA_h = AAUp; 608 cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 609 cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 610 ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 611 } else { 612 if (!upTriFactor->AA_h) { 613 cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 614 } 615 /* Fill the upper triangular matrix */ 616 offset = nzUpper; 617 for (i=n-1; i>=0; i--) { 618 v = aa + adiag[i+1] + 1; 619 620 /* number of elements NOT on the diagonal */ 621 nz = adiag[i] - adiag[i+1]-1; 622 623 /* decrement the offset */ 624 offset -= (nz+1); 625 626 /* first, set the diagonal elements */ 627 upTriFactor->AA_h[offset] = 1./v[nz]; 628 ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr); 629 } 630 upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper); 631 ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 632 } 633 } catch(char *ex) { 634 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 635 } 636 } 637 PetscFunctionReturn(0); 638 } 639 640 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 641 { 642 PetscErrorCode ierr; 643 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 644 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 645 IS isrow = a->row,iscol = a->icol; 646 PetscBool row_identity,col_identity; 647 PetscInt n = A->rmap->n; 648 649 PetscFunctionBegin; 650 if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 651 ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr); 652 ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr); 653 654 if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 655 cusparseTriFactors->nnz=a->nz; 656 657 A->offloadmask = PETSC_OFFLOAD_BOTH; 658 /* lower triangular indices */ 659 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 660 if (!row_identity && !cusparseTriFactors->rpermIndices) { 661 const PetscInt *r; 662 663 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 664 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 665 cusparseTriFactors->rpermIndices->assign(r, r+n); 666 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 667 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 668 } 669 670 /* upper triangular indices */ 671 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 672 if (!col_identity && !cusparseTriFactors->cpermIndices) { 673 const PetscInt *c; 674 675 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 676 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 677 cusparseTriFactors->cpermIndices->assign(c, c+n); 678 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 679 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 680 } 681 PetscFunctionReturn(0); 682 } 683 684 static PetscErrorCode 
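/*
   Builds the ICC/Cholesky triangular factors on the GPU. Only an upper triangular CSR structure is
   stored: the upper solve uses it directly, while the lower solve reuses the same structure through
   a transposed solve with separately scaled values (AALo), so no explicit lower triangular matrix
   is assembled.
*/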
MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 685 { 686 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 687 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 688 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 689 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 690 cusparseStatus_t stat; 691 PetscErrorCode ierr; 692 cudaError_t cerr; 693 PetscInt *AiUp, *AjUp; 694 PetscScalar *AAUp; 695 PetscScalar *AALo; 696 PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 697 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 698 const PetscInt *ai = b->i,*aj = b->j,*vj; 699 const MatScalar *aa = b->a,*v; 700 701 PetscFunctionBegin; 702 if (!n) PetscFunctionReturn(0); 703 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 704 try { 705 cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 706 cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 707 if (!upTriFactor && !loTriFactor) { 708 /* Allocate Space for the upper triangular matrix */ 709 cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 710 cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 711 712 /* Fill the upper triangular matrix */ 713 AiUp[0]=(PetscInt) 0; 714 AiUp[n]=nzUpper; 715 offset = 0; 716 for (i=0; i<n; i++) { 717 /* set the pointers */ 718 v = aa + ai[i]; 719 vj = aj + ai[i]; 720 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 721 722 /* first, set the diagonal elements */ 723 AjUp[offset] = (PetscInt) i; 724 AAUp[offset] = (MatScalar)1.0/v[nz]; 725 AiUp[i] = offset; 726 AALo[offset] = (MatScalar)1.0/v[nz]; 727 728 offset+=1; 729 if (nz>0) { 730 ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr); 731 ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 732 for (j=offset; j<offset+nz; j++) { 733 AAUp[j] = -AAUp[j]; 734 AALo[j] = AAUp[j]/v[nz]; 735 } 736 offset+=nz; 737 } 738 } 739 740 /* allocate space for the triangular factor information */ 741 ierr = PetscNew(&upTriFactor);CHKERRQ(ierr); 742 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 743 744 /* Create the matrix description */ 745 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 746 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 747 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 748 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 749 #else 750 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 751 #endif 752 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 753 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 754 755 /* set the matrix */ 756 upTriFactor->csrMat = new CsrMatrix; 757 upTriFactor->csrMat->num_rows = A->rmap->n; 758 upTriFactor->csrMat->num_cols = A->cmap->n; 759 upTriFactor->csrMat->num_entries = a->nz; 760 761 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 762 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 763 764 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 765 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 766 767 upTriFactor->csrMat->values = new 
THRUSTARRAY(a->nz); 768 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 769 770 /* set the operation */ 771 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 772 773 /* Create the solve analysis information */ 774 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 775 stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 776 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 777 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 778 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 779 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 780 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 781 &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 782 cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 783 #endif 784 785 /* perform the solve analysis */ 786 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 787 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 788 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 789 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo 790 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 791 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 792 #endif 793 );CHKERRCUSPARSE(stat); 794 cerr = WaitForCUDA();CHKERRCUDA(cerr); 795 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 796 797 /* assign the pointer */ 798 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 799 800 /* allocate space for the triangular factor information */ 801 ierr = PetscNew(&loTriFactor);CHKERRQ(ierr); 802 loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 803 804 /* Create the matrix description */ 805 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 806 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 807 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 808 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 809 #else 810 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 811 #endif 812 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 813 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 814 815 /* set the operation */ 816 loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 817 818 /* set the matrix */ 819 loTriFactor->csrMat = new CsrMatrix; 820 loTriFactor->csrMat->num_rows = A->rmap->n; 821 loTriFactor->csrMat->num_cols = A->cmap->n; 822 loTriFactor->csrMat->num_entries = a->nz; 823 824 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 825 loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 826 827 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 828 loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 829 830 loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 831 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 832 833 /* Create the solve analysis information */ 834 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 835 stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 836 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 837 
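/* With CUDA 9 and later the triangular-solve analysis needs an explicit, user-allocated work
   buffer: the required size is queried first and the buffer is cudaMalloc'ed before the
   analysis call below is performed. */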
stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 838 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 839 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 840 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 841 &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 842 cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 843 #endif 844 845 /* perform the solve analysis */ 846 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 847 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 848 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 849 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo 850 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 851 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 852 #endif 853 );CHKERRCUSPARSE(stat); 854 cerr = WaitForCUDA();CHKERRCUDA(cerr); 855 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 856 857 /* assign the pointer */ 858 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 859 860 ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr); 861 cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 862 cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 863 } else { 864 /* Fill the upper triangular matrix */ 865 offset = 0; 866 for (i=0; i<n; i++) { 867 /* set the pointers */ 868 v = aa + ai[i]; 869 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 870 871 /* first, set the diagonal elements */ 872 AAUp[offset] = 1.0/v[nz]; 873 AALo[offset] = 1.0/v[nz]; 874 875 offset+=1; 876 if (nz>0) { 877 ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 878 for (j=offset; j<offset+nz; j++) { 879 AAUp[j] = -AAUp[j]; 880 AALo[j] = AAUp[j]/v[nz]; 881 } 882 offset+=nz; 883 } 884 } 885 if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 886 if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 887 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 888 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 889 ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 890 } 891 cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr); 892 cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr); 893 } catch(char *ex) { 894 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 895 } 896 } 897 PetscFunctionReturn(0); 898 } 899 900 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 901 { 902 PetscErrorCode ierr; 903 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 904 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 905 IS ip = a->row; 906 PetscBool perm_identity; 907 PetscInt n = A->rmap->n; 908 909 PetscFunctionBegin; 910 if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 911 ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr); 912 if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 913 cusparseTriFactors->nnz=(a->nz-n)*2 + n; 914 915 A->offloadmask = PETSC_OFFLOAD_BOTH; 916 917 /* lower triangular indices */ 918 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 919 if (!perm_identity) { 920 IS iip; 921 const PetscInt *irip,*rip; 922 923 ierr = 
ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr); 924 ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr); 925 ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); 926 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 927 cusparseTriFactors->rpermIndices->assign(rip, rip+n); 928 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 929 cusparseTriFactors->cpermIndices->assign(irip, irip+n); 930 ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr); 931 ierr = ISDestroy(&iip);CHKERRQ(ierr); 932 ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 933 ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 934 } 935 PetscFunctionReturn(0); 936 } 937 938 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 939 { 940 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 941 IS isrow = b->row,iscol = b->col; 942 PetscBool row_identity,col_identity; 943 PetscErrorCode ierr; 944 945 PetscFunctionBegin; 946 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 947 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 948 B->offloadmask = PETSC_OFFLOAD_CPU; 949 /* determine which version of MatSolve needs to be used. */ 950 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 951 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 952 if (row_identity && col_identity) { 953 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 954 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 955 B->ops->matsolve = NULL; 956 B->ops->matsolvetranspose = NULL; 957 } else { 958 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 959 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 960 B->ops->matsolve = NULL; 961 B->ops->matsolvetranspose = NULL; 962 } 963 964 /* get the triangular factors */ 965 ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 966 PetscFunctionReturn(0); 967 } 968 969 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 970 { 971 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 972 IS ip = b->row; 973 PetscBool perm_identity; 974 PetscErrorCode ierr; 975 976 PetscFunctionBegin; 977 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 978 ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 979 B->offloadmask = PETSC_OFFLOAD_CPU; 980 /* determine which version of MatSolve needs to be used. 
*/ 981 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 982 if (perm_identity) { 983 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 984 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 985 B->ops->matsolve = NULL; 986 B->ops->matsolvetranspose = NULL; 987 } else { 988 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 989 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 990 B->ops->matsolve = NULL; 991 B->ops->matsolvetranspose = NULL; 992 } 993 994 /* get the triangular factors */ 995 ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 996 PetscFunctionReturn(0); 997 } 998 999 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 1000 { 1001 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1002 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1003 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1004 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; 1005 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; 1006 cusparseStatus_t stat; 1007 cusparseIndexBase_t indexBase; 1008 cusparseMatrixType_t matrixType; 1009 cusparseFillMode_t fillMode; 1010 cusparseDiagType_t diagType; 1011 cudaError_t cerr; 1012 PetscErrorCode ierr; 1013 1014 PetscFunctionBegin; 1015 /* allocate space for the transpose of the lower triangular factor */ 1016 ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr); 1017 loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1018 1019 /* set the matrix descriptors of the lower triangular factor */ 1020 matrixType = cusparseGetMatType(loTriFactor->descr); 1021 indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 1022 fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1023 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1024 diagType = cusparseGetMatDiagType(loTriFactor->descr); 1025 1026 /* Create the matrix description */ 1027 stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat); 1028 stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 1029 stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 1030 stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 1031 stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1032 1033 /* set the operation */ 1034 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1035 1036 /* allocate GPU space for the CSC of the lower triangular factor*/ 1037 loTriFactorT->csrMat = new CsrMatrix; 1038 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 1039 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 1040 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 1041 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1); 1042 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 1043 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 1044 1045 /* compute the transpose of the lower triangular factor, i.e. 
the CSC */ 1046 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1047 stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1048 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1049 loTriFactor->csrMat->values->data().get(), 1050 loTriFactor->csrMat->row_offsets->data().get(), 1051 loTriFactor->csrMat->column_indices->data().get(), 1052 loTriFactorT->csrMat->values->data().get(), 1053 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1054 CUSPARSE_ACTION_NUMERIC,indexBase, 1055 CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1056 cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1057 #endif 1058 1059 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1060 stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1061 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1062 loTriFactor->csrMat->values->data().get(), 1063 loTriFactor->csrMat->row_offsets->data().get(), 1064 loTriFactor->csrMat->column_indices->data().get(), 1065 loTriFactorT->csrMat->values->data().get(), 1066 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1067 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1068 CUSPARSE_ACTION_NUMERIC, indexBase, 1069 CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer 1070 #else 1071 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1072 CUSPARSE_ACTION_NUMERIC, indexBase 1073 #endif 1074 );CHKERRCUSPARSE(stat); 1075 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1076 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1077 1078 /* Create the solve analysis information */ 1079 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1080 stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1081 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1082 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, 1083 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1084 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1085 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, 1086 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1087 cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1088 #endif 1089 1090 /* perform the solve analysis */ 1091 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 1092 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1093 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1094 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo 1095 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1096 ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1097 #endif 1098 );CHKERRCUSPARSE(stat); 1099 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1100 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1101 1102 /* assign the pointer */ 1103 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 1104 1105 /*********************************************/ 1106 /* Now the Transpose of the Upper Tri Factor */ 1107 
/*********************************************/ 1108 1109 /* allocate space for the transpose of the upper triangular factor */ 1110 ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr); 1111 upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1112 1113 /* set the matrix descriptors of the upper triangular factor */ 1114 matrixType = cusparseGetMatType(upTriFactor->descr); 1115 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 1116 fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1117 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1118 diagType = cusparseGetMatDiagType(upTriFactor->descr); 1119 1120 /* Create the matrix description */ 1121 stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat); 1122 stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 1123 stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 1124 stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 1125 stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1126 1127 /* set the operation */ 1128 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1129 1130 /* allocate GPU space for the CSC of the upper triangular factor*/ 1131 upTriFactorT->csrMat = new CsrMatrix; 1132 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1133 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1134 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1135 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1); 1136 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1137 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1138 1139 /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 1140 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1141 stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows, 1142 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1143 upTriFactor->csrMat->values->data().get(), 1144 upTriFactor->csrMat->row_offsets->data().get(), 1145 upTriFactor->csrMat->column_indices->data().get(), 1146 upTriFactorT->csrMat->values->data().get(), 1147 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1148 CUSPARSE_ACTION_NUMERIC,indexBase, 1149 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1150 cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1151 #endif 1152 1153 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1154 stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 1155 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1156 upTriFactor->csrMat->values->data().get(), 1157 upTriFactor->csrMat->row_offsets->data().get(), 1158 upTriFactor->csrMat->column_indices->data().get(), 1159 upTriFactorT->csrMat->values->data().get(), 1160 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1161 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1162 CUSPARSE_ACTION_NUMERIC, indexBase, 1163 CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer 1164 #else 1165 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1166 CUSPARSE_ACTION_NUMERIC, indexBase 1167 #endif 1168 );CHKERRCUSPARSE(stat); 1169 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1170 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1171 1172 /* Create the solve analysis information */ 1173 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1174 stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1175 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1176 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, 1177 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1178 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1179 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, 1180 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1181 cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1182 #endif 1183 1184 /* perform the solve analysis */ 1185 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 1186 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1187 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1188 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo 1189 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1190 ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1191 #endif 1192 );CHKERRCUSPARSE(stat); 1193 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1194 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1195 1196 /* assign the pointer */ 1197 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 1198 PetscFunctionReturn(0); 1199 } 1200 1201 struct PetscScalarToPetscInt 1202 { 1203 __host__ __device__ 1204 PetscInt 
operator()(PetscScalar s) 1205 { 1206 return (PetscInt)PetscRealPart(s); 1207 } 1208 }; 1209 1210 static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A) 1211 { 1212 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1213 Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT; 1214 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1215 cusparseStatus_t stat; 1216 cusparseIndexBase_t indexBase; 1217 cudaError_t err; 1218 PetscErrorCode ierr; 1219 1220 PetscFunctionBegin; 1221 if (!cusparsestruct->transgen || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0); 1222 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1223 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1224 if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing mat struct"); 1225 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1226 if (cusparsestruct->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing matTranspose struct"); 1227 if (cusparsestruct->transupdated) PetscFunctionReturn(0); 1228 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1229 if (cusparsestruct->format != MAT_CUSPARSE_CSR) { 1230 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 1231 } 1232 if (!cusparsestruct->matTranspose) { /* create cusparse matrix */ 1233 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 1234 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat); 1235 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1236 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat); 1237 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1238 1239 /* set alpha and beta */ 1240 err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 1241 err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1242 err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1243 err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1244 err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1245 err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1246 1247 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1248 CsrMatrix *matrixT = new CsrMatrix; 1249 matstructT->mat = matrixT; 1250 matrixT->num_rows = A->cmap->n; 1251 matrixT->num_cols = A->rmap->n; 1252 matrixT->num_entries = a->nz; 1253 matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 1254 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1255 matrixT->values = new THRUSTARRAY(a->nz); 1256 1257 if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); } 1258 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 1259 1260 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1261 stat = cusparseCreateCsr(&matstructT->matDescr, 1262 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1263 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1264 matrixT->values->data().get(), 1265 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1266 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 1267 #endif 1268 } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || 
cusparsestruct->format == MAT_CUSPARSE_HYB) { 1269 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1270 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1271 #else 1272 CsrMatrix *temp = new CsrMatrix; 1273 CsrMatrix *tempT = new CsrMatrix; 1274 /* First convert HYB to CSR */ 1275 temp->num_rows = A->rmap->n; 1276 temp->num_cols = A->cmap->n; 1277 temp->num_entries = a->nz; 1278 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1279 temp->column_indices = new THRUSTINTARRAY32(a->nz); 1280 temp->values = new THRUSTARRAY(a->nz); 1281 1282 stat = cusparse_hyb2csr(cusparsestruct->handle, 1283 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 1284 temp->values->data().get(), 1285 temp->row_offsets->data().get(), 1286 temp->column_indices->data().get());CHKERRCUSPARSE(stat); 1287 1288 /* Next, convert CSR to CSC (i.e. the matrix transpose) */ 1289 tempT->num_rows = A->rmap->n; 1290 tempT->num_cols = A->cmap->n; 1291 tempT->num_entries = a->nz; 1292 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1293 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1294 tempT->values = new THRUSTARRAY(a->nz); 1295 1296 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1297 temp->num_cols, temp->num_entries, 1298 temp->values->data().get(), 1299 temp->row_offsets->data().get(), 1300 temp->column_indices->data().get(), 1301 tempT->values->data().get(), 1302 tempT->column_indices->data().get(), 1303 tempT->row_offsets->data().get(), 1304 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1305 1306 /* Last, convert CSC to HYB */ 1307 cusparseHybMat_t hybMat; 1308 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1309 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 
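/* MAX stores the whole matrix in the ELL part, AUTO lets cusparse choose the ELL/COO split */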
1310 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1311 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1312 matstructT->descr, tempT->values->data().get(), 1313 tempT->row_offsets->data().get(), 1314 tempT->column_indices->data().get(), 1315 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1316 1317 /* assign the pointer */ 1318 matstructT->mat = hybMat; 1319 cusparsestruct->transupdated = PETSC_TRUE; 1320 /* delete temporaries */ 1321 if (tempT) { 1322 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1323 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1324 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1325 delete (CsrMatrix*) tempT; 1326 } 1327 if (temp) { 1328 if (temp->values) delete (THRUSTARRAY*) temp->values; 1329 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1330 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1331 delete (CsrMatrix*) temp; 1332 } 1333 #endif 1334 } 1335 } 1336 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */ 1337 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1338 CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat; 1339 if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix"); 1340 if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix rows"); 1341 if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix cols"); 1342 if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix values"); 1343 if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT"); 1344 if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT rows"); 1345 if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT cols"); 1346 if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT values"); 1347 if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */ 1348 cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1349 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 1350 ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 1351 } 1352 if (!cusparsestruct->csr2csc_i) { 1353 THRUSTARRAY csr2csc_a(matrix->num_entries); 1354 PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0)); 1355 1356 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1357 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1358 void *csr2cscBuffer; 1359 size_t csr2cscBufferSize; 1360 stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, 1361 A->cmap->n, matrix->num_entries, 1362 matrix->values->data().get(), 1363 cusparsestruct->rowoffsets_gpu->data().get(), 1364 matrix->column_indices->data().get(), 1365 matrixT->values->data().get(), 1366 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1367 CUSPARSE_ACTION_NUMERIC,indexBase, 1368 cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat); 1369 err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err); 1370 #endif 1371 1372 stat = cusparse_csr2csc(cusparsestruct->handle, 1373 A->rmap->n,A->cmap->n,matrix->num_entries, 1374 csr2csc_a.data().get(),cusparsestruct->rowoffsets_gpu->data().get(),matrix->column_indices->data().get(), 1375 
matrixT->values->data().get(), 1376 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1377 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1378 CUSPARSE_ACTION_NUMERIC,indexBase, 1379 cusparsestruct->csr2cscAlg, csr2cscBuffer 1380 #else 1381 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1382 CUSPARSE_ACTION_NUMERIC, indexBase 1383 #endif 1384 );CHKERRCUSPARSE(stat); 1385 cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); 1386 PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt())); 1387 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1388 err = cudaFree(csr2cscBuffer);CHKERRCUDA(err); 1389 #endif 1390 } 1391 PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), 1392 thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), 1393 matrixT->values->begin())); 1394 } 1395 ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1396 /* the compressed row indices is not used for matTranspose */ 1397 matstructT->cprowIndices = NULL; 1398 /* assign the pointer */ 1399 ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 1400 cusparsestruct->transupdated = PETSC_TRUE; 1401 PetscFunctionReturn(0); 1402 } 1403 1404 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 1405 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1406 { 1407 PetscInt n = xx->map->n; 1408 const PetscScalar *barray; 1409 PetscScalar *xarray; 1410 thrust::device_ptr<const PetscScalar> bGPU; 1411 thrust::device_ptr<PetscScalar> xGPU; 1412 cusparseStatus_t stat; 1413 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1414 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1415 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1416 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1417 PetscErrorCode ierr; 1418 cudaError_t cerr; 1419 1420 PetscFunctionBegin; 1421 /* Analyze the matrix and create the transpose ... 
on the fly */ 1422 if (!loTriFactorT && !upTriFactorT) { 1423 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1424 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1425 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1426 } 1427 1428 /* Get the GPU pointers */ 1429 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1430 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1431 xGPU = thrust::device_pointer_cast(xarray); 1432 bGPU = thrust::device_pointer_cast(barray); 1433 1434 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1435 /* First, reorder with the row permutation */ 1436 thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1437 thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 1438 xGPU); 1439 1440 /* First, solve U */ 1441 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1442 upTriFactorT->csrMat->num_rows, 1443 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1444 upTriFactorT->csrMat->num_entries, 1445 #endif 1446 &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1447 upTriFactorT->csrMat->values->data().get(), 1448 upTriFactorT->csrMat->row_offsets->data().get(), 1449 upTriFactorT->csrMat->column_indices->data().get(), 1450 upTriFactorT->solveInfo, 1451 xarray, tempGPU->data().get() 1452 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1453 ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1454 #endif 1455 );CHKERRCUSPARSE(stat); 1456 1457 /* Then, solve L */ 1458 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1459 loTriFactorT->csrMat->num_rows, 1460 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1461 loTriFactorT->csrMat->num_entries, 1462 #endif 1463 &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1464 loTriFactorT->csrMat->values->data().get(), 1465 loTriFactorT->csrMat->row_offsets->data().get(), 1466 loTriFactorT->csrMat->column_indices->data().get(), 1467 loTriFactorT->solveInfo, 1468 tempGPU->data().get(), xarray 1469 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1470 ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1471 #endif 1472 );CHKERRCUSPARSE(stat); 1473 1474 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1475 thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1476 thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1477 tempGPU->begin()); 1478 1479 /* Copy the temporary to the full solution. 
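The gather above applied the column permutation out of place (xGPU into tempGPU), so this last copy simply moves the permuted values back into x before the vector arrays are restored.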
*/ 1480 thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU); 1481 1482 /* restore */ 1483 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1484 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1485 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1486 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1487 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1488 PetscFunctionReturn(0); 1489 } 1490 1491 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1492 { 1493 const PetscScalar *barray; 1494 PetscScalar *xarray; 1495 cusparseStatus_t stat; 1496 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1497 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1498 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1499 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1500 PetscErrorCode ierr; 1501 cudaError_t cerr; 1502 1503 PetscFunctionBegin; 1504 /* Analyze the matrix and create the transpose ... on the fly */ 1505 if (!loTriFactorT && !upTriFactorT) { 1506 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1507 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1508 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1509 } 1510 1511 /* Get the GPU pointers */ 1512 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1513 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1514 1515 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1516 /* First, solve U */ 1517 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1518 upTriFactorT->csrMat->num_rows, 1519 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1520 upTriFactorT->csrMat->num_entries, 1521 #endif 1522 &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1523 upTriFactorT->csrMat->values->data().get(), 1524 upTriFactorT->csrMat->row_offsets->data().get(), 1525 upTriFactorT->csrMat->column_indices->data().get(), 1526 upTriFactorT->solveInfo, 1527 barray, tempGPU->data().get() 1528 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1529 ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1530 #endif 1531 );CHKERRCUSPARSE(stat); 1532 1533 /* Then, solve L */ 1534 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1535 loTriFactorT->csrMat->num_rows, 1536 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1537 loTriFactorT->csrMat->num_entries, 1538 #endif 1539 &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1540 loTriFactorT->csrMat->values->data().get(), 1541 loTriFactorT->csrMat->row_offsets->data().get(), 1542 loTriFactorT->csrMat->column_indices->data().get(), 1543 loTriFactorT->solveInfo, 1544 tempGPU->data().get(), xarray 1545 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1546 ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1547 #endif 1548 );CHKERRCUSPARSE(stat); 1549 1550 /* restore */ 1551 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1552 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1553 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1554 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1555 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1556 PetscFunctionReturn(0); 1557 } 1558 1559 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1560 { 1561 const PetscScalar *barray; 1562 PetscScalar 
*xarray; 1563 thrust::device_ptr<const PetscScalar> bGPU; 1564 thrust::device_ptr<PetscScalar> xGPU; 1565 cusparseStatus_t stat; 1566 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1567 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1568 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1569 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1570 PetscErrorCode ierr; 1571 cudaError_t cerr; 1572 1573 PetscFunctionBegin; 1574 1575 /* Get the GPU pointers */ 1576 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1577 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1578 xGPU = thrust::device_pointer_cast(xarray); 1579 bGPU = thrust::device_pointer_cast(barray); 1580 1581 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1582 /* First, reorder with the row permutation */ 1583 thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1584 thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 1585 tempGPU->begin()); 1586 1587 /* Next, solve L */ 1588 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1589 loTriFactor->csrMat->num_rows, 1590 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1591 loTriFactor->csrMat->num_entries, 1592 #endif 1593 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1594 loTriFactor->csrMat->values->data().get(), 1595 loTriFactor->csrMat->row_offsets->data().get(), 1596 loTriFactor->csrMat->column_indices->data().get(), 1597 loTriFactor->solveInfo, 1598 tempGPU->data().get(), xarray 1599 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1600 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 1601 #endif 1602 );CHKERRCUSPARSE(stat); 1603 1604 /* Then, solve U */ 1605 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1606 upTriFactor->csrMat->num_rows, 1607 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1608 upTriFactor->csrMat->num_entries, 1609 #endif 1610 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1611 upTriFactor->csrMat->values->data().get(), 1612 upTriFactor->csrMat->row_offsets->data().get(), 1613 upTriFactor->csrMat->column_indices->data().get(), 1614 upTriFactor->solveInfo, 1615 xarray, tempGPU->data().get() 1616 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1617 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 1618 #endif 1619 );CHKERRCUSPARSE(stat); 1620 1621 /* Last, reorder with the column permutation */ 1622 thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 1623 thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), 1624 xGPU); 1625 1626 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1627 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1628 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1629 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1630 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1631 PetscFunctionReturn(0); 1632 } 1633 1634 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1635 { 1636 const PetscScalar *barray; 1637 PetscScalar *xarray; 1638 cusparseStatus_t stat; 1639 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1640 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1641 
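/* In the natural-ordering variant there are no row/column permutations to apply: b feeds the lower solve directly and the upper solve writes straight into x, with tempGPU as the only intermediate. */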
Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1642 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1643 PetscErrorCode ierr; 1644 cudaError_t cerr; 1645 1646 PetscFunctionBegin; 1647 /* Get the GPU pointers */ 1648 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1649 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1650 1651 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1652 /* First, solve L */ 1653 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1654 loTriFactor->csrMat->num_rows, 1655 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1656 loTriFactor->csrMat->num_entries, 1657 #endif 1658 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1659 loTriFactor->csrMat->values->data().get(), 1660 loTriFactor->csrMat->row_offsets->data().get(), 1661 loTriFactor->csrMat->column_indices->data().get(), 1662 loTriFactor->solveInfo, 1663 barray, tempGPU->data().get() 1664 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1665 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 1666 #endif 1667 );CHKERRCUSPARSE(stat); 1668 1669 /* Next, solve U */ 1670 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1671 upTriFactor->csrMat->num_rows, 1672 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1673 upTriFactor->csrMat->num_entries, 1674 #endif 1675 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1676 upTriFactor->csrMat->values->data().get(), 1677 upTriFactor->csrMat->row_offsets->data().get(), 1678 upTriFactor->csrMat->column_indices->data().get(), 1679 upTriFactor->solveInfo, 1680 tempGPU->data().get(), xarray 1681 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1682 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 1683 #endif 1684 );CHKERRCUSPARSE(stat); 1685 1686 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1687 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1688 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1689 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1690 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1691 PetscFunctionReturn(0); 1692 } 1693 1694 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) 1695 { 1696 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1697 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1698 cudaError_t cerr; 1699 PetscErrorCode ierr; 1700 1701 PetscFunctionBegin; 1702 if (A->offloadmask == PETSC_OFFLOAD_GPU) { 1703 CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat; 1704 1705 ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 1706 cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 1707 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1708 ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr); 1709 ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 1710 A->offloadmask = PETSC_OFFLOAD_BOTH; 1711 } 1712 PetscFunctionReturn(0); 1713 } 1714 1715 static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 1716 { 1717 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1718 PetscErrorCode ierr; 1719 1720 PetscFunctionBegin; 1721 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 1722 *array = a->a; 1723 A->offloadmask = PETSC_OFFLOAD_CPU; 1724 PetscFunctionReturn(0); 1725 } 1726 1727 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 1728 { 1729 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1730 Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 1731 Mat_SeqAIJ *a = 
(Mat_SeqAIJ*)A->data; 1732 PetscInt m = A->rmap->n,*ii,*ridx,tmp; 1733 PetscErrorCode ierr; 1734 cusparseStatus_t stat; 1735 PetscBool both = PETSC_TRUE; 1736 cudaError_t err; 1737 1738 PetscFunctionBegin; 1739 if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Cannot copy to GPU"); 1740 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 1741 if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */ 1742 CsrMatrix *matrix; 1743 matrix = (CsrMatrix*)cusparsestruct->mat->mat; 1744 1745 if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR values"); 1746 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1747 matrix->values->assign(a->a, a->a+a->nz); 1748 err = WaitForCUDA();CHKERRCUDA(err); 1749 ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 1750 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1751 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); 1752 } else { 1753 PetscInt nnz; 1754 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1755 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr); 1756 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 1757 delete cusparsestruct->workVector; 1758 delete cusparsestruct->rowoffsets_gpu; 1759 cusparsestruct->workVector = NULL; 1760 cusparsestruct->rowoffsets_gpu = NULL; 1761 try { 1762 if (a->compressedrow.use) { 1763 m = a->compressedrow.nrows; 1764 ii = a->compressedrow.i; 1765 ridx = a->compressedrow.rindex; 1766 } else { 1767 m = A->rmap->n; 1768 ii = a->i; 1769 ridx = NULL; 1770 } 1771 if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR row data"); 1772 if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR column data"); 1773 if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; } 1774 else nnz = a->nz; 1775 1776 /* create cusparse matrix */ 1777 cusparsestruct->nrows = m; 1778 matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 1779 stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat); 1780 stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 1781 stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1782 1783 err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 1784 err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1785 err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1786 err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1787 err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1788 err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1789 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 1790 1791 /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1792 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1793 /* set the matrix */ 1794 CsrMatrix *mat= new CsrMatrix; 1795 mat->num_rows = m; 1796 mat->num_cols = A->cmap->n; 1797 mat->num_entries = nnz; 1798 mat->row_offsets = new THRUSTINTARRAY32(m+1); 1799 mat->row_offsets->assign(ii, ii + 
m+1); 1800 1801 mat->column_indices = new THRUSTINTARRAY32(nnz); 1802 mat->column_indices->assign(a->j, a->j+nnz); 1803 1804 mat->values = new THRUSTARRAY(nnz); 1805 if (a->a) mat->values->assign(a->a, a->a+nnz); 1806 1807 /* assign the pointer */ 1808 matstruct->mat = mat; 1809 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1810 if (mat->num_rows) { /* cusparse errors on empty matrices! */ 1811 stat = cusparseCreateCsr(&matstruct->matDescr, 1812 mat->num_rows, mat->num_cols, mat->num_entries, 1813 mat->row_offsets->data().get(), mat->column_indices->data().get(), 1814 mat->values->data().get(), 1815 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 1816 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 1817 } 1818 #endif 1819 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1820 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1821 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1822 #else 1823 CsrMatrix *mat= new CsrMatrix; 1824 mat->num_rows = m; 1825 mat->num_cols = A->cmap->n; 1826 mat->num_entries = nnz; 1827 mat->row_offsets = new THRUSTINTARRAY32(m+1); 1828 mat->row_offsets->assign(ii, ii + m+1); 1829 1830 mat->column_indices = new THRUSTINTARRAY32(nnz); 1831 mat->column_indices->assign(a->j, a->j+nnz); 1832 1833 mat->values = new THRUSTARRAY(nnz); 1834 if (a->a) mat->values->assign(a->a, a->a+nnz); 1835 1836 cusparseHybMat_t hybMat; 1837 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1838 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1839 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1840 stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, 1841 matstruct->descr, mat->values->data().get(), 1842 mat->row_offsets->data().get(), 1843 mat->column_indices->data().get(), 1844 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1845 /* assign the pointer */ 1846 matstruct->mat = hybMat; 1847 1848 if (mat) { 1849 if (mat->values) delete (THRUSTARRAY*)mat->values; 1850 if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices; 1851 if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets; 1852 delete (CsrMatrix*)mat; 1853 } 1854 #endif 1855 } 1856 1857 /* assign the compressed row indices */ 1858 if (a->compressedrow.use) { 1859 cusparsestruct->workVector = new THRUSTARRAY(m); 1860 matstruct->cprowIndices = new THRUSTINTARRAY(m); 1861 matstruct->cprowIndices->assign(ridx,ridx+m); 1862 tmp = m; 1863 } else { 1864 cusparsestruct->workVector = NULL; 1865 matstruct->cprowIndices = NULL; 1866 tmp = 0; 1867 } 1868 ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr); 1869 1870 /* assign the pointer */ 1871 cusparsestruct->mat = matstruct; 1872 } catch(char *ex) { 1873 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1874 } 1875 err = WaitForCUDA();CHKERRCUDA(err); 1876 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1877 cusparsestruct->nonzerostate = A->nonzerostate; 1878 } 1879 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 1880 } 1881 PetscFunctionReturn(0); 1882 } 1883 1884 struct VecCUDAPlusEquals 1885 { 1886 template <typename Tuple> 1887 __host__ __device__ 1888 void operator()(Tuple t) 1889 { 1890 thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 1891 } 1892 }; 1893 1894 struct VecCUDAEquals 1895 { 1896 template <typename 
Tuple> 1897 __host__ __device__ 1898 void operator()(Tuple t) 1899 { 1900 thrust::get<1>(t) = thrust::get<0>(t); 1901 } 1902 }; 1903 1904 struct VecCUDAEqualsReverse 1905 { 1906 template <typename Tuple> 1907 __host__ __device__ 1908 void operator()(Tuple t) 1909 { 1910 thrust::get<0>(t) = thrust::get<1>(t); 1911 } 1912 }; 1913 1914 struct MatMatCusparse { 1915 PetscBool cisdense; 1916 PetscScalar *Bt; 1917 Mat X; 1918 PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */ 1919 PetscLogDouble flops; 1920 CsrMatrix *Bcsr; 1921 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1922 cusparseSpMatDescr_t matSpBDescr; 1923 PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 1924 cusparseDnMatDescr_t matBDescr; 1925 cusparseDnMatDescr_t matCDescr; 1926 PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/ 1927 size_t mmBufferSize; 1928 void *mmBuffer; 1929 void *mmBuffer2; /* SpGEMM WorkEstimation buffer */ 1930 cusparseSpGEMMDescr_t spgemmDesc; 1931 #endif 1932 }; 1933 1934 static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 1935 { 1936 PetscErrorCode ierr; 1937 MatMatCusparse *mmdata = (MatMatCusparse *)data; 1938 cudaError_t cerr; 1939 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1940 cusparseStatus_t stat; 1941 #endif 1942 1943 PetscFunctionBegin; 1944 cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr); 1945 delete mmdata->Bcsr; 1946 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1947 if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); } 1948 if (mmdata->mmBuffer) { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); } 1949 if (mmdata->mmBuffer2) { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); } 1950 if (mmdata->matBDescr) { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); } 1951 if (mmdata->matCDescr) { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); } 1952 if (mmdata->spgemmDesc) { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); } 1953 #endif 1954 ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr); 1955 ierr = PetscFree(data);CHKERRQ(ierr); 1956 PetscFunctionReturn(0); 1957 } 1958 1959 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool); 1960 1961 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 1962 { 1963 Mat_Product *product = C->product; 1964 Mat A,B; 1965 PetscInt m,n,blda,clda; 1966 PetscBool flg,biscuda; 1967 Mat_SeqAIJCUSPARSE *cusp; 1968 cusparseStatus_t stat; 1969 cusparseOperation_t opA; 1970 const PetscScalar *barray; 1971 PetscScalar *carray; 1972 PetscErrorCode ierr; 1973 MatMatCusparse *mmdata; 1974 Mat_SeqAIJCUSPARSEMultStruct *mat; 1975 CsrMatrix *csrmat; 1976 cudaError_t cerr; 1977 1978 PetscFunctionBegin; 1979 MatCheckProduct(C,1); 1980 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 1981 mmdata = (MatMatCusparse*)product->data; 1982 A = product->A; 1983 B = product->B; 1984 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 1985 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 1986 /* currently CopyToGpu does not copy if the matrix is bound to CPU 1987 Instead of silently accepting the wrong answer, I prefer to raise the error */ 1988 if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix 
between MatProductSymbolic and MatProductNumeric phases"); 1989 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1990 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1991 switch (product->type) { 1992 case MATPRODUCT_AB: 1993 case MATPRODUCT_PtAP: 1994 mat = cusp->mat; 1995 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1996 m = A->rmap->n; 1997 n = B->cmap->n; 1998 break; 1999 case MATPRODUCT_AtB: 2000 if (!cusp->transgen) { 2001 mat = cusp->mat; 2002 opA = CUSPARSE_OPERATION_TRANSPOSE; 2003 } else { 2004 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 2005 mat = cusp->matTranspose; 2006 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2007 } 2008 m = A->cmap->n; 2009 n = B->cmap->n; 2010 break; 2011 case MATPRODUCT_ABt: 2012 case MATPRODUCT_RARt: 2013 mat = cusp->mat; 2014 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2015 m = A->rmap->n; 2016 n = B->rmap->n; 2017 break; 2018 default: 2019 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 2020 } 2021 if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 2022 csrmat = (CsrMatrix*)mat->mat; 2023 /* if the user passed a CPU matrix, copy the data to the GPU */ 2024 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr); 2025 if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);} 2026 ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr); 2027 2028 ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr); 2029 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2030 ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2031 ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr); 2032 } else { 2033 ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr); 2034 ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr); 2035 } 2036 2037 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2038 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2039 cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? 
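/* ABt and RARt multiply against B^T; cusparseSpMM can transpose the dense operand itself, whereas the pre-11.0 csrmm path below has to build an explicit Bt with cublasXgeam */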
CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 2040 /* (re)allcoate mmBuffer if not initialized or LDAs are different */ 2041 if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 2042 size_t mmBufferSize; 2043 if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;} 2044 if (!mmdata->matBDescr) { 2045 stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2046 mmdata->Blda = blda; 2047 } 2048 2049 if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;} 2050 if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 2051 stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2052 mmdata->Clda = clda; 2053 } 2054 2055 if (!mat->matDescr) { 2056 stat = cusparseCreateCsr(&mat->matDescr, 2057 csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, 2058 csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), 2059 csrmat->values->data().get(), 2060 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2061 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 2062 } 2063 stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one, 2064 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2065 mmdata->matCDescr,cusparse_scalartype, 2066 cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat); 2067 if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) { 2068 cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); 2069 cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr); 2070 mmdata->mmBufferSize = mmBufferSize; 2071 } 2072 mmdata->initialized = PETSC_TRUE; 2073 } else { 2074 /* to be safe, always update pointers of the mats */ 2075 stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat); 2076 stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat); 2077 stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat); 2078 } 2079 2080 /* do cusparseSpMM, which supports transpose on B */ 2081 stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one, 2082 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2083 mmdata->matCDescr,cusparse_scalartype, 2084 cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2085 #else 2086 PetscInt k; 2087 /* cusparseXcsrmm does not support transpose on B */ 2088 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2089 cublasHandle_t cublasv2handle; 2090 cublasStatus_t cerr; 2091 2092 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 2093 cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T, 2094 B->cmap->n,B->rmap->n, 2095 &PETSC_CUSPARSE_ONE ,barray,blda, 2096 &PETSC_CUSPARSE_ZERO,barray,blda, 2097 mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr); 2098 blda = B->cmap->n; 2099 k = B->cmap->n; 2100 } else { 2101 k = B->rmap->n; 2102 } 2103 2104 /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 2105 stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k, 2106 csrmat->num_entries,mat->alpha_one,mat->descr, 2107 csrmat->values->data().get(), 2108 csrmat->row_offsets->data().get(), 2109 csrmat->column_indices->data().get(), 2110 mmdata->Bt ? 
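/* Bt is only allocated (in the symbolic phase) for ABt/RARt, where it holds the explicit transpose formed by cublasXgeam above */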
mmdata->Bt : barray,blda,mat->beta_zero, 2111 carray,clda);CHKERRCUSPARSE(stat); 2112 #endif 2113 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2114 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2115 ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr); 2116 ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr); 2117 if (product->type == MATPRODUCT_RARt) { 2118 ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2119 ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2120 } else if (product->type == MATPRODUCT_PtAP) { 2121 ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2122 ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2123 } else { 2124 ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr); 2125 } 2126 if (mmdata->cisdense) { 2127 ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr); 2128 } 2129 if (!biscuda) { 2130 ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 2131 } 2132 PetscFunctionReturn(0); 2133 } 2134 2135 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2136 { 2137 Mat_Product *product = C->product; 2138 Mat A,B; 2139 PetscInt m,n; 2140 PetscBool cisdense,flg; 2141 PetscErrorCode ierr; 2142 MatMatCusparse *mmdata; 2143 Mat_SeqAIJCUSPARSE *cusp; 2144 2145 PetscFunctionBegin; 2146 MatCheckProduct(C,1); 2147 if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty"); 2148 A = product->A; 2149 B = product->B; 2150 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2151 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 2152 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2153 if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2154 switch (product->type) { 2155 case MATPRODUCT_AB: 2156 m = A->rmap->n; 2157 n = B->cmap->n; 2158 break; 2159 case MATPRODUCT_AtB: 2160 m = A->cmap->n; 2161 n = B->cmap->n; 2162 break; 2163 case MATPRODUCT_ABt: 2164 m = A->rmap->n; 2165 n = B->rmap->n; 2166 break; 2167 case MATPRODUCT_PtAP: 2168 m = B->cmap->n; 2169 n = B->cmap->n; 2170 break; 2171 case MATPRODUCT_RARt: 2172 m = B->rmap->n; 2173 n = B->rmap->n; 2174 break; 2175 default: 2176 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 2177 } 2178 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2179 /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */ 2180 ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr); 2181 ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr); 2182 2183 /* product data */ 2184 ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2185 mmdata->cisdense = cisdense; 2186 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0) 2187 /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */ 2188 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2189 cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr); 2190 } 2191 #endif 2192 /* for these products we need intermediate storage */ 2193 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2194 ierr = 
MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr); 2195 ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr); 2196 if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */ 2197 ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr); 2198 } else { 2199 ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr); 2200 } 2201 } 2202 C->product->data = mmdata; 2203 C->product->destroy = MatDestroy_MatMatCusparse; 2204 2205 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 2206 PetscFunctionReturn(0); 2207 } 2208 2209 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2210 { 2211 Mat_Product *product = C->product; 2212 Mat A,B; 2213 Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2214 Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 2215 Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2216 CsrMatrix *Acsr,*Bcsr,*Ccsr; 2217 PetscBool flg; 2218 PetscErrorCode ierr; 2219 cusparseStatus_t stat; 2220 cudaError_t cerr; 2221 MatProductType ptype; 2222 MatMatCusparse *mmdata; 2223 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2224 cusparseSpMatDescr_t BmatSpDescr; 2225 #endif 2226 2227 PetscFunctionBegin; 2228 MatCheckProduct(C,1); 2229 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 2230 ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2231 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for C of type %s",((PetscObject)C)->type_name); 2232 mmdata = (MatMatCusparse*)C->product->data; 2233 A = product->A; 2234 B = product->B; 2235 if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */ 2236 mmdata->reusesym = PETSC_FALSE; 2237 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2238 if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2239 Cmat = Ccusp->mat; 2240 if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]); 2241 Ccsr = (CsrMatrix*)Cmat->mat; 2242 if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct"); 2243 goto finalize; 2244 } 2245 if (!c->nz) goto finalize; 2246 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2247 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 2248 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2249 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name); 2250 if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2251 if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2252 Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2253 Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2254 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2255 if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2256 if 
(Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2257 if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2258 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2259 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 2260 2261 ptype = product->type; 2262 if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 2263 if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 2264 switch (ptype) { 2265 case MATPRODUCT_AB: 2266 Amat = Acusp->mat; 2267 Bmat = Bcusp->mat; 2268 break; 2269 case MATPRODUCT_AtB: 2270 Amat = Acusp->matTranspose; 2271 Bmat = Bcusp->mat; 2272 break; 2273 case MATPRODUCT_ABt: 2274 Amat = Acusp->mat; 2275 Bmat = Bcusp->matTranspose; 2276 break; 2277 default: 2278 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 2279 } 2280 Cmat = Ccusp->mat; 2281 if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2282 if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2283 if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[ptype]); 2284 Acsr = (CsrMatrix*)Amat->mat; 2285 Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */ 2286 Ccsr = (CsrMatrix*)Cmat->mat; 2287 if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct"); 2288 if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct"); 2289 if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct"); 2290 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2291 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2292 BmatSpDescr = mmdata->Bcsr ? 
mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */ 2293 stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2294 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2295 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2296 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2297 stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2298 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2299 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2300 #else 2301 stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2302 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2303 Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2304 Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2305 Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2306 #endif 2307 ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2308 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2309 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2310 C->offloadmask = PETSC_OFFLOAD_GPU; 2311 finalize: 2312 /* shorter version of MatAssemblyEnd_SeqAIJ */ 2313 ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 2314 ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 2315 ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr); 2316 c->reallocs = 0; 2317 C->info.mallocs += 0; 2318 C->info.nz_unneeded = 0; 2319 C->assembled = C->was_assembled = PETSC_TRUE; 2320 C->num_ass++; 2321 PetscFunctionReturn(0); 2322 } 2323 2324 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2325 { 2326 Mat_Product *product = C->product; 2327 Mat A,B; 2328 Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2329 Mat_SeqAIJ *a,*b,*c; 2330 Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2331 CsrMatrix *Acsr,*Bcsr,*Ccsr; 2332 PetscInt i,j,m,n,k; 2333 PetscBool flg; 2334 PetscErrorCode ierr; 2335 cusparseStatus_t stat; 2336 cudaError_t cerr; 2337 MatProductType ptype; 2338 MatMatCusparse *mmdata; 2339 PetscLogDouble flops; 2340 PetscBool biscompressed,ciscompressed; 2341 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2342 int64_t C_num_rows1, C_num_cols1, C_nnz1; 2343 size_t bufSize2; 2344 cusparseSpMatDescr_t BmatSpDescr; 2345 #else 2346 int cnz; 2347 #endif 2348 2349 PetscFunctionBegin; 2350 MatCheckProduct(C,1); 2351 if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty"); 2352 A = product->A; 2353 B = product->B; 2354 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2355 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 2356 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2357 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name); 2358 a = (Mat_SeqAIJ*)A->data; 2359 b = (Mat_SeqAIJ*)B->data; 2360 Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2361 Bcusp = 
(Mat_SeqAIJCUSPARSE*)B->spptr; 2362 if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2363 if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2364 2365 /* product data */ 2366 ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2367 C->product->data = mmdata; 2368 C->product->destroy = MatDestroy_MatMatCusparse; 2369 2370 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2371 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 2372 ptype = product->type; 2373 if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 2374 if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 2375 biscompressed = PETSC_FALSE; 2376 ciscompressed = PETSC_FALSE; 2377 switch (ptype) { 2378 case MATPRODUCT_AB: 2379 m = A->rmap->n; 2380 n = B->cmap->n; 2381 k = A->cmap->n; 2382 Amat = Acusp->mat; 2383 Bmat = Bcusp->mat; 2384 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2385 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2386 break; 2387 case MATPRODUCT_AtB: 2388 m = A->cmap->n; 2389 n = B->cmap->n; 2390 k = A->rmap->n; 2391 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 2392 Amat = Acusp->matTranspose; 2393 Bmat = Bcusp->mat; 2394 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2395 break; 2396 case MATPRODUCT_ABt: 2397 m = A->rmap->n; 2398 n = B->rmap->n; 2399 k = A->cmap->n; 2400 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr); 2401 Amat = Acusp->mat; 2402 Bmat = Bcusp->matTranspose; 2403 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2404 break; 2405 default: 2406 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 2407 } 2408 2409 /* create cusparse matrix */ 2410 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2411 ierr = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 2412 c = (Mat_SeqAIJ*)C->data; 2413 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2414 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 2415 Ccsr = new CsrMatrix; 2416 2417 c->compressedrow.use = ciscompressed; 2418 if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */ 2419 c->compressedrow.nrows = a->compressedrow.nrows; 2420 ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr); 2421 ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr); 2422 Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 2423 Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 2424 Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows); 2425 } else { 2426 c->compressedrow.nrows = 0; 2427 c->compressedrow.i = NULL; 2428 c->compressedrow.rindex = NULL; 2429 Ccusp->workVector = NULL; 2430 Cmat->cprowIndices = NULL; 2431 } 2432 Ccusp->nrows = ciscompressed ? 
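/* with compressed-row storage inherited from A, only the rows of C that can be nonzero are kept in the GPU CSR */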
c->compressedrow.nrows : m; 2433 Ccusp->mat = Cmat; 2434 Ccusp->mat->mat = Ccsr; 2435 Ccsr->num_rows = Ccusp->nrows; 2436 Ccsr->num_cols = n; 2437 Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1); 2438 stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat); 2439 stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 2440 stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 2441 cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 2442 cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 2443 cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 2444 cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2445 cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2446 cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2447 if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */ 2448 thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0); 2449 c->nz = 0; 2450 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2451 Ccsr->values = new THRUSTARRAY(c->nz); 2452 goto finalizesym; 2453 } 2454 2455 if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2456 if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2457 Acsr = (CsrMatrix*)Amat->mat; 2458 if (!biscompressed) { 2459 Bcsr = (CsrMatrix*)Bmat->mat; 2460 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2461 BmatSpDescr = Bmat->matDescr; 2462 #endif 2463 } else { /* we need to use row offsets for the full matrix */ 2464 CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat; 2465 Bcsr = new CsrMatrix; 2466 Bcsr->num_rows = B->rmap->n; 2467 Bcsr->num_cols = cBcsr->num_cols; 2468 Bcsr->num_entries = cBcsr->num_entries; 2469 Bcsr->column_indices = cBcsr->column_indices; 2470 Bcsr->values = cBcsr->values; 2471 if (!Bcusp->rowoffsets_gpu) { 2472 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 2473 Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 2474 ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 2475 } 2476 Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 2477 mmdata->Bcsr = Bcsr; 2478 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2479 if (Bcsr->num_rows && Bcsr->num_cols) { 2480 stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, 2481 Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2482 Bcsr->values->data().get(), 2483 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2484 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2485 } 2486 BmatSpDescr = mmdata->matSpBDescr; 2487 #endif 2488 } 2489 if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct"); 2490 if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct"); 2491 /* precompute flops count */ 2492 if (ptype == MATPRODUCT_AB) { 2493 for (i=0, flops = 0; i<A->rmap->n; i++) { 2494 const PetscInt st = a->i[i]; 2495 const PetscInt en = a->i[i+1]; 2496 for (j=st; j<en; j++) { 2497 const PetscInt brow = 
a->j[j]; 2498 flops += 2.*(b->i[brow+1] - b->i[brow]); 2499 } 2500 } 2501 } else if (ptype == MATPRODUCT_AtB) { 2502 for (i=0, flops = 0; i<A->rmap->n; i++) { 2503 const PetscInt anzi = a->i[i+1] - a->i[i]; 2504 const PetscInt bnzi = b->i[i+1] - b->i[i]; 2505 flops += (2.*anzi)*bnzi; 2506 } 2507 } else { /* TODO */ 2508 flops = 0.; 2509 } 2510 2511 mmdata->flops = flops; 2512 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2513 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2514 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2515 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, 2516 NULL, NULL, NULL, 2517 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2518 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2519 stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2520 /* ask bufferSize bytes for external memory */ 2521 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2522 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2523 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2524 mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat); 2525 cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr); 2526 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2527 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2528 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2529 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2530 mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat); 2531 /* ask bufferSize again bytes for external memory */ 2532 stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2533 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2534 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2535 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat); 2536 /* The CUSPARSE documentation is not clear, nor the API 2537 We need both buffers to perform the operations properly! 2538 mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 2539 it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 2540 is stored in the descriptor! What a messy API... 
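In short, the sequence used here is: workEstimation (size query), allocate mmBuffer2, workEstimation again (fill), compute (size query), allocate mmBuffer, compute (numeric), SpMatGetSize, allocate the C arrays, CsrSetPointers, then SpGEMM_copy; both buffers are kept in mmdata because the numeric phase calls SpGEMM_compute/copy again with the same descriptor.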
*/ 2541 cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr); 2542 /* compute the intermediate product of A * B */ 2543 stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2544 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2545 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2546 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2547 /* get matrix C non-zero entries C_nnz1 */ 2548 stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat); 2549 c->nz = (PetscInt) C_nnz1; 2550 ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr); 2551 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2552 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2553 Ccsr->values = new THRUSTARRAY(c->nz); 2554 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2555 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2556 Ccsr->values->data().get());CHKERRCUSPARSE(stat); 2557 stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2558 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2559 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2560 #else 2561 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 2562 stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2563 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2564 Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2565 Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2566 Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat); 2567 c->nz = cnz; 2568 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2569 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2570 Ccsr->values = new THRUSTARRAY(c->nz); 2571 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2572 2573 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2574 /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 2575 I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 2576 D is NULL, despite the fact that CUSPARSE documentation claims it is supported! 
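     As a consequence, the full numeric csrgemm below is executed already here in the symbolic phase
     (see also the reusesym logic at the end of this function).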
*/ 2577 stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2578 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2579 Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2580 Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2581 Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2582 #endif 2583 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2584 ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2585 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2586 finalizesym: 2587 c->singlemalloc = PETSC_FALSE; 2588 c->free_a = PETSC_TRUE; 2589 c->free_ij = PETSC_TRUE; 2590 ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 2591 ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 2592 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 2593 PetscInt *d_i = c->i; 2594 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 2595 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 2596 ii = *Ccsr->row_offsets; 2597 jj = *Ccsr->column_indices; 2598 if (ciscompressed) d_i = c->compressedrow.i; 2599 cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2600 cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2601 } else { 2602 PetscInt *d_i = c->i; 2603 if (ciscompressed) d_i = c->compressedrow.i; 2604 cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2605 cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2606 } 2607 if (ciscompressed) { /* need to expand host row offsets */ 2608 PetscInt r = 0; 2609 c->i[0] = 0; 2610 for (k = 0; k < c->compressedrow.nrows; k++) { 2611 const PetscInt next = c->compressedrow.rindex[k]; 2612 const PetscInt old = c->compressedrow.i[k]; 2613 for (; r < next; r++) c->i[r+1] = old; 2614 } 2615 for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows]; 2616 } 2617 ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 2618 ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 2619 ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 2620 c->maxnz = c->nz; 2621 c->nonzerorowcnt = 0; 2622 c->rmax = 0; 2623 for (k = 0; k < m; k++) { 2624 const PetscInt nn = c->i[k+1] - c->i[k]; 2625 c->ilen[k] = c->imax[k] = nn; 2626 c->nonzerorowcnt += (PetscInt)!!nn; 2627 c->rmax = PetscMax(c->rmax,nn); 2628 } 2629 ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr); 2630 ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 2631 Ccsr->num_entries = c->nz; 2632 2633 C->nonzerostate++; 2634 ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr); 2635 ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr); 2636 Ccusp->nonzerostate = C->nonzerostate; 2637 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 2638 C->preallocated = PETSC_TRUE; 2639 C->assembled = PETSC_FALSE; 2640 C->was_assembled = PETSC_FALSE; 2641 if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */ 2642 mmdata->reusesym 
= PETSC_TRUE; 2643 C->offloadmask = PETSC_OFFLOAD_GPU; 2644 } 2645 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2646 PetscFunctionReturn(0); 2647 } 2648 2649 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 2650 2651 /* handles sparse or dense B */ 2652 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat) 2653 { 2654 Mat_Product *product = mat->product; 2655 PetscErrorCode ierr; 2656 PetscBool isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE; 2657 2658 PetscFunctionBegin; 2659 MatCheckProduct(mat,1); 2660 ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr); 2661 if (!product->A->boundtocpu && !product->B->boundtocpu) { 2662 ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr); 2663 } 2664 if (product->type == MATPRODUCT_ABC) { 2665 Ciscusp = PETSC_FALSE; 2666 if (!product->C->boundtocpu) { 2667 ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr); 2668 } 2669 } 2670 if (isdense) { 2671 switch (product->type) { 2672 case MATPRODUCT_AB: 2673 case MATPRODUCT_AtB: 2674 case MATPRODUCT_ABt: 2675 case MATPRODUCT_PtAP: 2676 case MATPRODUCT_RARt: 2677 if (product->A->boundtocpu) { 2678 ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr); 2679 } else { 2680 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 2681 } 2682 break; 2683 case MATPRODUCT_ABC: 2684 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2685 break; 2686 default: 2687 break; 2688 } 2689 } else if (Biscusp && Ciscusp) { 2690 switch (product->type) { 2691 case MATPRODUCT_AB: 2692 case MATPRODUCT_AtB: 2693 case MATPRODUCT_ABt: 2694 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2695 break; 2696 case MATPRODUCT_PtAP: 2697 case MATPRODUCT_RARt: 2698 case MATPRODUCT_ABC: 2699 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2700 break; 2701 default: 2702 break; 2703 } 2704 } else { /* fallback for AIJ */ 2705 ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 2706 } 2707 PetscFunctionReturn(0); 2708 } 2709 2710 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2711 { 2712 PetscErrorCode ierr; 2713 2714 PetscFunctionBegin; 2715 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2716 PetscFunctionReturn(0); 2717 } 2718 2719 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz) 2720 { 2721 PetscErrorCode ierr; 2722 2723 PetscFunctionBegin; 2724 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2725 PetscFunctionReturn(0); 2726 } 2727 2728 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2729 { 2730 PetscErrorCode ierr; 2731 2732 PetscFunctionBegin; 2733 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 2734 PetscFunctionReturn(0); 2735 } 2736 2737 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2738 { 2739 PetscErrorCode ierr; 2740 2741 PetscFunctionBegin; 2742 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 2743 PetscFunctionReturn(0); 2744 } 2745 2746 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2747 { 2748 PetscErrorCode ierr; 2749 2750 PetscFunctionBegin; 2751 ierr = 
MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2752 PetscFunctionReturn(0); 2753 } 2754 2755 /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 2756 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm) 2757 { 2758 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2759 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 2760 Mat_SeqAIJCUSPARSEMultStruct *matstruct; 2761 PetscScalar *xarray,*zarray,*dptr,*beta,*xptr; 2762 PetscErrorCode ierr; 2763 cudaError_t cerr; 2764 cusparseStatus_t stat; 2765 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2766 PetscBool compressed; 2767 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2768 PetscInt nx,ny; 2769 #endif 2770 2771 PetscFunctionBegin; 2772 if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported"); 2773 if (!a->nonzerorowcnt) { 2774 if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);} 2775 else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);} 2776 PetscFunctionReturn(0); 2777 } 2778 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 2779 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2780 if (!trans) { 2781 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 2782 if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 2783 } else { 2784 if (herm || !cusparsestruct->transgen) { 2785 opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 2786 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 2787 } else { 2788 if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);} 2789 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 2790 } 2791 } 2792 /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 2793 compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE; 2794 2795 try { 2796 ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 2797 if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */ 2798 else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */ 2799 2800 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2801 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 2802 /* z = A x + beta y. 2803 If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 2804 When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 2805 */ 2806 xptr = xarray; 2807 dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 2808 beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 2809 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2810 /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 2811 allocated to accommodate different uses. So we get the length info directly from mat. 
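       For the non-transpose branch below, nx is the number of columns and ny the number of stored rows of the CSR matrix;
       when compressed rows are used, the stored rows are only the nonzero ones, which matches the segment of the work
       vector that the SpMV actually writes.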
2812 */ 2813 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2814 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2815 nx = mat->num_cols; 2816 ny = mat->num_rows; 2817 } 2818 #endif 2819 } else { 2820 /* z = A^T x + beta y 2821 If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 2822 Note A^Tx is of full length, so we set beta to 1.0 if y exists. 2823 */ 2824 xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 2825 dptr = zarray; 2826 beta = yy ? matstruct->beta_one : matstruct->beta_zero; 2827 if (compressed) { /* Scatter x to work vector */ 2828 thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 2829 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 2830 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 2831 VecCUDAEqualsReverse()); 2832 } 2833 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2834 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2835 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2836 nx = mat->num_rows; 2837 ny = mat->num_cols; 2838 } 2839 #endif 2840 } 2841 2842 /* csr_spmv does y = alpha op(A) x + beta y */ 2843 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2844 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2845 if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 2846 if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 2847 stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 2848 stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 2849 stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, 2850 matstruct->matDescr, 2851 matstruct->cuSpMV[opA].vecXDescr, beta, 2852 matstruct->cuSpMV[opA].vecYDescr, 2853 cusparse_scalartype, 2854 cusparsestruct->spmvAlg, 2855 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat); 2856 cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr); 2857 2858 matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 2859 } else { 2860 /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 2861 stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat); 2862 stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat); 2863 } 2864 2865 stat = cusparseSpMV(cusparsestruct->handle, opA, 2866 matstruct->alpha_one, 2867 matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */ 2868 matstruct->cuSpMV[opA].vecXDescr, 2869 beta, 2870 matstruct->cuSpMV[opA].vecYDescr, 2871 cusparse_scalartype, 2872 cusparsestruct->spmvAlg, 2873 matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat); 2874 #else 2875 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2876 stat = cusparse_csr_spmv(cusparsestruct->handle, opA, 2877 mat->num_rows, mat->num_cols, 2878 mat->num_entries, matstruct->alpha_one, matstruct->descr, 2879 mat->values->data().get(), mat->row_offsets->data().get(), 2880 mat->column_indices->data().get(), xptr, beta, 2881 dptr);CHKERRCUSPARSE(stat); 2882 
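      /* cusparse_csr_spmv above is PETSc's precision-dependent wrapper around the legacy cusparseXcsrmv() routines, which were removed in CUDA 11 (hence the version guard) */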
#endif 2883 } else { 2884 if (cusparsestruct->nrows) { 2885 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2886 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 2887 #else 2888 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 2889 stat = cusparse_hyb_spmv(cusparsestruct->handle, opA, 2890 matstruct->alpha_one, matstruct->descr, hybMat, 2891 xptr, beta, 2892 dptr);CHKERRCUSPARSE(stat); 2893 #endif 2894 } 2895 } 2896 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2897 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2898 2899 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 2900 if (yy) { /* MatMultAdd: zz = A*xx + yy */ 2901 if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 2902 ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */ 2903 } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */ 2904 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 2905 } 2906 } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 2907 ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr); 2908 } 2909 2910 /* ScatterAdd the result from work vector into the full vector when A is compressed */ 2911 if (compressed) { 2912 thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 2913 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2914 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 2915 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 2916 VecCUDAPlusEquals()); 2917 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2918 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2919 } 2920 } else { 2921 if (yy && yy != zz) { 2922 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 2923 } 2924 } 2925 ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 2926 if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);} 2927 else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);} 2928 } catch(char *ex) { 2929 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 2930 } 2931 if (yy) { 2932 ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); 2933 } else { 2934 ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr); 2935 } 2936 PetscFunctionReturn(0); 2937 } 2938 2939 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2940 { 2941 PetscErrorCode ierr; 2942 2943 PetscFunctionBegin; 2944 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2945 PetscFunctionReturn(0); 2946 } 2947 2948 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 2949 { 2950 PetscErrorCode ierr; 2951 PetscSplitCSRDataStructure *d_mat = NULL; 2952 PetscFunctionBegin; 2953 if (A->factortype == MAT_FACTOR_NONE) { 2954 d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat; 2955 } 2956 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it? 
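  /* nothing more to do for a flush assembly or a CPU-bound matrix; if a device matrix has been set up, the values were assembled directly on the GPU, so mark the GPU copy as the up-to-date one */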
2957 if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0); 2958 if (d_mat) { 2959 A->offloadmask = PETSC_OFFLOAD_GPU; 2960 } 2961 2962 PetscFunctionReturn(0); 2963 } 2964 2965 /* --------------------------------------------------------------------------------*/ 2966 /*@ 2967 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 2968 (the default sequential PETSc format). This matrix will ultimately be pushed down 2969 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 2970 assembly performance the user should preallocate the matrix storage by setting 2971 the parameter nz (or the array nnz). By setting these parameters accurately, 2972 performance during matrix assembly can be increased by more than a factor of 50. 2973 2974 Collective 2975 2976 Input Parameters: 2977 + comm - MPI communicator, set to PETSC_COMM_SELF 2978 . m - number of rows 2979 . n - number of columns 2980 . nz - number of nonzeros per row (same for all rows) 2981 - nnz - array containing the number of nonzeros in the various rows 2982 (possibly different for each row) or NULL 2983 2984 Output Parameter: 2985 . A - the matrix 2986 2987 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 2988 MatXXXXSetPreallocation() paradigm instead of this routine directly. 2989 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 2990 2991 Notes: 2992 If nnz is given then nz is ignored 2993 2994 The AIJ format (also called the Yale sparse matrix format or 2995 compressed row storage) is fully compatible with standard Fortran 77 2996 storage. That is, the stored row and column indices can begin at 2997 either one (as in Fortran) or zero. See the users' manual for details. 2998 2999 Specify the preallocated storage with either nz or nnz (not both). 3000 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 3001 allocation. For large problems you MUST preallocate memory or you 3002 will get TERRIBLE performance, see the users' manual chapter on matrices. 3003 3004 By default, this format uses inodes (identical nodes) when possible, to 3005 improve numerical efficiency of matrix-vector products and solves. We 3006 search for consecutive rows with the same nonzero structure, thereby 3007 reusing matrix information to achieve increased efficiency.
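   Example usage, a minimal sketch assuming a square matrix of size m with at most 3 nonzeros per row (error checking with CHKERRQ() omitted); entries are inserted with MatSetValues() before assembly, and the data is copied to the GPU on first use, e.g. in MatMult():

     Mat         A;
     PetscInt    i = 0;
     PetscScalar v = 1.0;
     MatCreateSeqAIJCUSPARSE(PETSC_COMM_SELF,m,m,3,NULL,&A);
     MatSetValues(A,1,&i,1,&i,&v,INSERT_VALUES);
     MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
     MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
     MatDestroy(&A);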
3008 3009 Level: intermediate 3010 3011 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 3012 @*/ 3013 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 3014 { 3015 PetscErrorCode ierr; 3016 3017 PetscFunctionBegin; 3018 ierr = MatCreate(comm,A);CHKERRQ(ierr); 3019 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 3020 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3021 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 3022 PetscFunctionReturn(0); 3023 } 3024 3025 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 3026 { 3027 PetscErrorCode ierr; 3028 PetscSplitCSRDataStructure *d_mat = NULL; 3029 3030 PetscFunctionBegin; 3031 if (A->factortype == MAT_FACTOR_NONE) { 3032 d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat; 3033 ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL; 3034 ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr); 3035 } else { 3036 ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr); 3037 } 3038 if (d_mat) { 3039 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3040 cudaError_t err; 3041 PetscSplitCSRDataStructure h_mat; 3042 ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr); 3043 err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err); 3044 if (a->compressedrow.use) { 3045 err = cudaFree(h_mat.diag.i);CHKERRCUDA(err); 3046 } 3047 err = cudaFree(d_mat);CHKERRCUDA(err); 3048 } 3049 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr); 3050 ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr); 3051 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 3052 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 3053 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr); 3054 ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 3055 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 3056 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 3057 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 3058 PetscFunctionReturn(0); 3059 } 3060 3061 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*); 3062 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool); 3063 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B) 3064 { 3065 PetscErrorCode ierr; 3066 3067 PetscFunctionBegin; 3068 ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 3069 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr); 3070 PetscFunctionReturn(0); 3071 } 3072 3073 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str) 3074 { 3075 PetscErrorCode ierr; 3076 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 3077 Mat_SeqAIJCUSPARSE *cy; 3078 Mat_SeqAIJCUSPARSE *cx; 3079 PetscScalar *ay; 3080 const PetscScalar *ax; 3081 CsrMatrix *csry,*csrx; 3082 cudaError_t cerr; 3083 3084 
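  /* Computes Y = a*X + Y on the GPU: a single cublasXaxpy() when the two nonzero patterns coincide, csrgeam when X's pattern is (or may be) a subset of Y's, and the CPU MatAXPY_SeqAIJ() as fallback for general patterns or CPU-bound operands */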
PetscFunctionBegin; 3085 cy = (Mat_SeqAIJCUSPARSE*)Y->spptr; 3086 cx = (Mat_SeqAIJCUSPARSE*)X->spptr; 3087 if (X->ops->axpy != Y->ops->axpy) { 3088 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr); 3089 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 3090 PetscFunctionReturn(0); 3091 } 3092 /* if we are here, it means both matrices are bound to GPU */ 3093 ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr); 3094 ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr); 3095 if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported"); 3096 if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported"); 3097 csry = (CsrMatrix*)cy->mat->mat; 3098 csrx = (CsrMatrix*)cx->mat->mat; 3099 /* see if we can turn this into a cublas axpy */ 3100 if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) { 3101 bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin()); 3102 if (eq) { 3103 eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin()); 3104 } 3105 if (eq) str = SAME_NONZERO_PATTERN; 3106 } 3107 /* spgeam is buggy with one column */ 3108 if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN; 3109 3110 if (str == SUBSET_NONZERO_PATTERN) { 3111 cusparseStatus_t stat; 3112 PetscScalar b = 1.0; 3113 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3114 size_t bufferSize; 3115 void *buffer; 3116 #endif 3117 3118 ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr); 3119 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3120 stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 3121 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3122 stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n, 3123 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3124 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3125 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat); 3126 cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr); 3127 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3128 stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3129 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3130 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3131 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat); 3132 cerr = WaitForCUDA();CHKERRCUDA(cerr); 3133 ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr); 3134 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3135 cerr = cudaFree(buffer);CHKERRCUDA(cerr); 3136 #else 3137 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3138 stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3139 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3140 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3141 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat); 3142 cerr = WaitForCUDA();CHKERRCUDA(cerr); 3143 ierr = PetscLogGpuFlops(x->nz + 
y->nz);CHKERRQ(ierr); 3144 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3145 #endif 3146 stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 3147 ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr); 3148 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3149 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3150 } else if (str == SAME_NONZERO_PATTERN) { 3151 cublasHandle_t cublasv2handle; 3152 cublasStatus_t berr; 3153 PetscBLASInt one = 1, bnz = 1; 3154 3155 ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr); 3156 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3157 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 3158 ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr); 3159 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3160 berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr); 3161 cerr = WaitForCUDA();CHKERRCUDA(cerr); 3162 ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr); 3163 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3164 ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr); 3165 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3166 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3167 } else { 3168 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr); 3169 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 3170 } 3171 PetscFunctionReturn(0); 3172 } 3173 3174 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a) 3175 { 3176 PetscErrorCode ierr; 3177 Mat_SeqAIJ *y = (Mat_SeqAIJ*)Y->data; 3178 PetscScalar *ay; 3179 cudaError_t cerr; 3180 cublasHandle_t cublasv2handle; 3181 cublasStatus_t berr; 3182 PetscBLASInt one = 1, bnz = 1; 3183 3184 PetscFunctionBegin; 3185 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3186 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 3187 ierr = PetscBLASIntCast(y->nz,&bnz);CHKERRQ(ierr); 3188 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3189 berr = cublasXscal(cublasv2handle,bnz,&a,ay,one);CHKERRCUBLAS(berr); 3190 cerr = WaitForCUDA();CHKERRCUDA(cerr); 3191 ierr = PetscLogGpuFlops(bnz);CHKERRQ(ierr); 3192 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3193 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3194 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3195 PetscFunctionReturn(0); 3196 } 3197 3198 static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) 3199 { 3200 PetscErrorCode ierr; 3201 PetscBool both = PETSC_FALSE; 3202 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3203 3204 PetscFunctionBegin; 3205 if (A->factortype == MAT_FACTOR_NONE) { 3206 Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr; 3207 if (spptr->mat) { 3208 CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat; 3209 if (matrix->values) { 3210 both = PETSC_TRUE; 3211 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3212 } 3213 } 3214 if (spptr->matTranspose) { 3215 CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat; 3216 if (matrix->values) { 3217 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3218 } 3219 } 3220 } 3221 //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 3222 ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr); 3223 ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 3224 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 3225 else A->offloadmask = PETSC_OFFLOAD_CPU; 3226 3227 PetscFunctionReturn(0); 3228 } 3229 3230 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg) 3231 { 3232 Mat_SeqAIJ *a = 
(Mat_SeqAIJ*)A->data; 3233 PetscErrorCode ierr; 3234 3235 PetscFunctionBegin; 3236 if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0); 3237 if (flg) { 3238 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 3239 3240 A->ops->scale = MatScale_SeqAIJ; 3241 A->ops->axpy = MatAXPY_SeqAIJ; 3242 A->ops->zeroentries = MatZeroEntries_SeqAIJ; 3243 A->ops->mult = MatMult_SeqAIJ; 3244 A->ops->multadd = MatMultAdd_SeqAIJ; 3245 A->ops->multtranspose = MatMultTranspose_SeqAIJ; 3246 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 3247 A->ops->multhermitiantranspose = NULL; 3248 A->ops->multhermitiantransposeadd = NULL; 3249 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ; 3250 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr); 3251 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 3252 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 3253 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 3254 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 3255 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr); 3256 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr); 3257 } else { 3258 A->ops->scale = MatScale_SeqAIJCUSPARSE; 3259 A->ops->axpy = MatAXPY_SeqAIJCUSPARSE; 3260 A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE; 3261 A->ops->mult = MatMult_SeqAIJCUSPARSE; 3262 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 3263 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 3264 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 3265 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 3266 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 3267 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE; 3268 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr); 3269 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3270 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3271 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3272 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3273 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr); 3274 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3275 } 3276 A->boundtocpu = flg; 3277 a->inode.use = flg; 3278 PetscFunctionReturn(0); 3279 } 3280 3281 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 3282 { 3283 PetscErrorCode ierr; 3284 cusparseStatus_t stat; 3285 Mat B; 3286 3287 PetscFunctionBegin; 3288 ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE 
may be via MatConvert */ 3289 if (reuse == MAT_INITIAL_MATRIX) { 3290 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 3291 } else if (reuse == MAT_REUSE_MATRIX) { 3292 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 3293 } 3294 B = *newmat; 3295 3296 ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 3297 ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr); 3298 3299 if (reuse != MAT_REUSE_MATRIX && !B->spptr) { 3300 if (B->factortype == MAT_FACTOR_NONE) { 3301 Mat_SeqAIJCUSPARSE *spptr; 3302 3303 ierr = PetscNew(&spptr);CHKERRQ(ierr); 3304 spptr->format = MAT_CUSPARSE_CSR; 3305 stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3306 B->spptr = spptr; 3307 spptr->deviceMat = NULL; 3308 } else { 3309 Mat_SeqAIJCUSPARSETriFactors *spptr; 3310 3311 ierr = PetscNew(&spptr);CHKERRQ(ierr); 3312 stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3313 B->spptr = spptr; 3314 } 3315 B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 3316 } 3317 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 3318 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 3319 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 3320 B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 3321 B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 3322 3323 ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr); 3324 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3325 ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 3326 PetscFunctionReturn(0); 3327 } 3328 3329 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 3330 { 3331 PetscErrorCode ierr; 3332 3333 PetscFunctionBegin; 3334 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 3335 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 3336 ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr); 3337 ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr); 3338 ierr = PetscOptionsEnd();CHKERRQ(ierr); 3339 PetscFunctionReturn(0); 3340 } 3341 3342 /*MC 3343 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 3344 3345 A matrix type whose data resides on Nvidia GPUs. These matrices can be in either 3346 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 3347 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 3348 3349 Options Database Keys: 3350 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 3351 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3352 - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
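   Example, a hypothetical command line assuming an application ./app that calls MatSetFromOptions() on its matrix:

     ./app -mat_type aijcusparse -mat_cusparse_storage_format csr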
3353 3354 Level: beginner 3355 3356 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 3357 M*/ 3358 3359 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*); 3360 3361 3362 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 3363 { 3364 PetscErrorCode ierr; 3365 3366 PetscFunctionBegin; 3367 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3368 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3369 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3370 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3371 PetscFunctionReturn(0); 3372 } 3373 3374 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 3375 { 3376 PetscErrorCode ierr; 3377 cusparseStatus_t stat; 3378 3379 PetscFunctionBegin; 3380 if (*cusparsestruct) { 3381 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr); 3382 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr); 3383 delete (*cusparsestruct)->workVector; 3384 delete (*cusparsestruct)->rowoffsets_gpu; 3385 delete (*cusparsestruct)->cooPerm; 3386 delete (*cusparsestruct)->cooPerm_a; 3387 delete (*cusparsestruct)->csr2csc_i; 3388 if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);} 3389 ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr); 3390 } 3391 PetscFunctionReturn(0); 3392 } 3393 3394 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 3395 { 3396 PetscFunctionBegin; 3397 if (*mat) { 3398 delete (*mat)->values; 3399 delete (*mat)->column_indices; 3400 delete (*mat)->row_offsets; 3401 delete *mat; 3402 *mat = 0; 3403 } 3404 PetscFunctionReturn(0); 3405 } 3406 3407 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 3408 { 3409 cusparseStatus_t stat; 3410 PetscErrorCode ierr; 3411 3412 PetscFunctionBegin; 3413 if (*trifactor) { 3414 if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); } 3415 if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); } 3416 ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 3417 if ((*trifactor)->solveBuffer) {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);} 3418 if ((*trifactor)->AA_h) {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);} 3419 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3420 if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);} 3421 #endif 3422 ierr = PetscFree(*trifactor);CHKERRQ(ierr); 3423 } 3424 PetscFunctionReturn(0); 3425 } 3426 3427 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 3428 { 3429 CsrMatrix *mat; 3430 cusparseStatus_t stat; 3431 cudaError_t err; 3432 3433 PetscFunctionBegin; 3434 if (*matstruct) { 3435 if ((*matstruct)->mat) { 3436 if (format==MAT_CUSPARSE_ELL || 
format==MAT_CUSPARSE_HYB) { 3437 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3438 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3439 #else 3440 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 3441 stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat); 3442 #endif 3443 } else { 3444 mat = (CsrMatrix*)(*matstruct)->mat; 3445 CsrMatrix_Destroy(&mat); 3446 } 3447 } 3448 if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); } 3449 delete (*matstruct)->cprowIndices; 3450 if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); } 3451 if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); } 3452 if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); } 3453 3454 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3455 Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 3456 if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);} 3457 for (int i=0; i<3; i++) { 3458 if (mdata->cuSpMV[i].initialized) { 3459 err = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err); 3460 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat); 3461 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat); 3462 } 3463 } 3464 #endif 3465 delete *matstruct; 3466 *matstruct = NULL; 3467 } 3468 PetscFunctionReturn(0); 3469 } 3470 3471 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors) 3472 { 3473 PetscErrorCode ierr; 3474 3475 PetscFunctionBegin; 3476 if (*trifactors) { 3477 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr); 3478 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr); 3479 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr); 3480 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr); 3481 delete (*trifactors)->rpermIndices; 3482 delete (*trifactors)->cpermIndices; 3483 delete (*trifactors)->workVector; 3484 (*trifactors)->rpermIndices = NULL; 3485 (*trifactors)->cpermIndices = NULL; 3486 (*trifactors)->workVector = NULL; 3487 } 3488 PetscFunctionReturn(0); 3489 } 3490 3491 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 3492 { 3493 PetscErrorCode ierr; 3494 cusparseHandle_t handle; 3495 cusparseStatus_t stat; 3496 3497 PetscFunctionBegin; 3498 if (*trifactors) { 3499 ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr); 3500 if (handle = (*trifactors)->handle) { 3501 stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 3502 } 3503 ierr = PetscFree(*trifactors);CHKERRQ(ierr); 3504 } 3505 PetscFunctionReturn(0); 3506 } 3507 3508 struct IJCompare 3509 { 3510 __host__ __device__ 3511 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3512 { 3513 if (t1.get<0>() < t2.get<0>()) return true; 3514 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 3515 return false; 3516 } 3517 }; 3518 3519 struct IJEqual 3520 { 3521 __host__ __device__ 3522 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3523 { 3524 if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false; 3525 return true; 3526 } 3527 }; 3528 3529 struct 
IJDiff 3530 { 3531 __host__ __device__ 3532 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 3533 { 3534 return t1 == t2 ? 0 : 1; 3535 } 3536 }; 3537 3538 struct IJSum 3539 { 3540 __host__ __device__ 3541 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 3542 { 3543 return t1||t2; 3544 } 3545 }; 3546 3547 #include <thrust/iterator/discard_iterator.h> 3548 PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 3549 { 3550 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3551 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3552 THRUSTARRAY *cooPerm_v = NULL; 3553 thrust::device_ptr<const PetscScalar> d_v; 3554 CsrMatrix *matrix; 3555 PetscErrorCode ierr; 3556 cudaError_t cerr; 3557 PetscInt n; 3558 3559 PetscFunctionBegin; 3560 if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct"); 3561 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix"); 3562 if (!cusp->cooPerm) { 3563 ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3564 ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3565 PetscFunctionReturn(0); 3566 } 3567 matrix = (CsrMatrix*)cusp->mat->mat; 3568 if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3569 if (!v) { 3570 if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3571 goto finalize; 3572 } 3573 n = cusp->cooPerm->size(); 3574 if (isCudaMem(v)) { 3575 d_v = thrust::device_pointer_cast(v); 3576 } else { 3577 cooPerm_v = new THRUSTARRAY(n); 3578 cooPerm_v->assign(v,v+n); 3579 d_v = cooPerm_v->data(); 3580 ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); 3581 } 3582 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3583 if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */ 3584 if (cusp->cooPerm_a) { 3585 THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size()); 3586 auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3587 thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3588 thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>()); 3589 delete cooPerm_w; 3590 } else { 3591 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 3592 matrix->values->begin())); 3593 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 3594 matrix->values->end())); 3595 thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); 3596 } 3597 } else { 3598 if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */ 3599 auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3600 thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3601 } else { 3602 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 3603 matrix->values->begin())); 3604 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 3605 matrix->values->end())); 3606 thrust::for_each(zibit,zieit,VecCUDAEquals()); 3607 } 3608 } 
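  /* at this point the new values live only on the GPU; flag the device copy as current and update the host-side assembly bookkeeping below */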
3609 cerr = WaitForCUDA();CHKERRCUDA(cerr); 3610 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3611 finalize: 3612 delete cooPerm_v; 3613 A->offloadmask = PETSC_OFFLOAD_GPU; 3614 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 3615 /* shorter version of MatAssemblyEnd_SeqAIJ */ 3616 ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr); 3617 ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 3618 ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr); 3619 a->reallocs = 0; 3620 A->info.mallocs += 0; 3621 A->info.nz_unneeded = 0; 3622 A->assembled = A->was_assembled = PETSC_TRUE; 3623 A->num_ass++; 3624 PetscFunctionReturn(0); 3625 } 3626 3627 PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy) 3628 { 3629 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3630 PetscErrorCode ierr; 3631 3632 PetscFunctionBegin; 3633 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3634 if (!cusp) PetscFunctionReturn(0); 3635 if (destroy) { 3636 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr); 3637 delete cusp->csr2csc_i; 3638 cusp->csr2csc_i = NULL; 3639 } 3640 cusp->transupdated = PETSC_FALSE; 3641 PetscFunctionReturn(0); 3642 } 3643 3644 #include <thrust/binary_search.h> 3645 PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[]) 3646 { 3647 PetscErrorCode ierr; 3648 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3649 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3650 PetscInt cooPerm_n, nzr = 0; 3651 cudaError_t cerr; 3652 3653 PetscFunctionBegin; 3654 ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr); 3655 ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr); 3656 cooPerm_n = cusp->cooPerm ? 
cusp->cooPerm->size() : 0; 3657 if (n != cooPerm_n) { 3658 delete cusp->cooPerm; 3659 delete cusp->cooPerm_a; 3660 cusp->cooPerm = NULL; 3661 cusp->cooPerm_a = NULL; 3662 } 3663 if (n) { 3664 THRUSTINTARRAY d_i(n); 3665 THRUSTINTARRAY d_j(n); 3666 THRUSTINTARRAY ii(A->rmap->n); 3667 3668 if (!cusp->cooPerm) { cusp->cooPerm = new THRUSTINTARRAY(n); } 3669 if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); } 3670 3671 ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 3672 d_i.assign(coo_i,coo_i+n); 3673 d_j.assign(coo_j,coo_j+n); 3674 auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin())); 3675 auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end())); 3676 3677 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3678 thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0); 3679 thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); 3680 *cusp->cooPerm_a = d_i; 3681 THRUSTINTARRAY w = d_j; 3682 3683 auto nekey = thrust::unique(fkey, ekey, IJEqual()); 3684 if (nekey == ekey) { /* all entries are unique */ 3685 delete cusp->cooPerm_a; 3686 cusp->cooPerm_a = NULL; 3687 } else { /* I couldn't come up with a more elegant algorithm */ 3688 adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); 3689 adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff()); 3690 (*cusp->cooPerm_a)[0] = 0; 3691 w[0] = 0; 3692 thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); 3693 thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); 3694 } 3695 thrust::counting_iterator<PetscInt> search_begin(0); 3696 thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), 3697 search_begin, search_begin + A->rmap->n, 3698 ii.begin()); 3699 cerr = WaitForCUDA();CHKERRCUDA(cerr); 3700 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3701 3702 ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr); 3703 a->singlemalloc = PETSC_FALSE; 3704 a->free_a = PETSC_TRUE; 3705 a->free_ij = PETSC_TRUE; 3706 ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr); 3707 a->i[0] = 0; 3708 cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 3709 a->nz = a->maxnz = a->i[A->rmap->n]; 3710 a->rmax = 0; 3711 ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr); 3712 ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr); 3713 cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 3714 if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); } 3715 if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); } 3716 for (PetscInt i = 0; i < A->rmap->n; i++) { 3717 const PetscInt nnzr = a->i[i+1] - a->i[i]; 3718 nzr += (PetscInt)!!(nnzr); 3719 a->ilen[i] = a->imax[i] = nnzr; 3720 a->rmax = PetscMax(a->rmax,nnzr); 3721 } 3722 a->nonzerorowcnt = nzr; 3723 A->preallocated = PETSC_TRUE; 3724 ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr); 3725 ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr); 3726 } else { 3727 ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr); 3728 } 3729 ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr); 3730 3731 /* We want to allocate the CUSPARSE struct for matvec now. 
3732 The code is so convoluted now that I prefer to copy zeros */ 3733 ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr); 3734 ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr); 3735 A->offloadmask = PETSC_OFFLOAD_CPU; 3736 A->nonzerostate++; 3737 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 3738 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 3739 3740 A->assembled = PETSC_FALSE; 3741 A->was_assembled = PETSC_FALSE; 3742 PetscFunctionReturn(0); 3743 } 3744 3745 PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a) 3746 { 3747 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3748 CsrMatrix *csr; 3749 PetscErrorCode ierr; 3750 3751 PetscFunctionBegin; 3752 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3753 PetscValidPointer(a,2); 3754 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3755 if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3756 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 3757 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3758 csr = (CsrMatrix*)cusp->mat->mat; 3759 if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3760 *a = csr->values->data().get(); 3761 PetscFunctionReturn(0); 3762 } 3763 3764 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a) 3765 { 3766 PetscFunctionBegin; 3767 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3768 PetscValidPointer(a,2); 3769 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3770 *a = NULL; 3771 PetscFunctionReturn(0); 3772 } 3773 3774 PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a) 3775 { 3776 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3777 CsrMatrix *csr; 3778 PetscErrorCode ierr; 3779 3780 PetscFunctionBegin; 3781 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3782 PetscValidPointer(a,2); 3783 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3784 if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3785 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 3786 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3787 csr = (CsrMatrix*)cusp->mat->mat; 3788 if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3789 *a = csr->values->data().get(); 3790 A->offloadmask = PETSC_OFFLOAD_GPU; 3791 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); 3792 PetscFunctionReturn(0); 3793 } 3794 3795 PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a) 3796 { 3797 PetscErrorCode ierr; 3798 3799 PetscFunctionBegin; 3800 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3801 PetscValidPointer(a,2); 3802 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3803 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 3804 *a = NULL; 3805 PetscFunctionReturn(0); 3806 } 3807 3808 PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a) 3809 { 3810 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3811 CsrMatrix *csr; 3812 PetscErrorCode ierr; 3813 3814 PetscFunctionBegin; 3815 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3816 PetscValidPointer(a,2); 3817 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3818 if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3819 if (!cusp->mat) 

PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL;
  PetscFunctionReturn(0);
}

struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    if (t1.get<0>() < t2.get<0>()) return true;
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
    return false;
  }
};

struct Shift
{
  int _shift;

  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &c)
  {
    return c + _shift;
  }
};

/* merges two SeqAIJCUSPARSE matrices side by side, i.e. C = [A,B] ([A';B']' in MATLAB notation) */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     Annz,Bnnz;
  cusparseStatus_t             stat;
  PetscInt                     i,m,n,zero = 0;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",A->rmap->n,B->rmap->n);
  if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
  if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    m    = A->rmap->n;
    n    = A->cmap->n + B->cmap->n;
    ierr = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
    ierr = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
    ierr = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    c     = (Mat_SeqAIJ*)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
    Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr  = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector = NULL;
    Ccusp->nrows    = m;
    Ccusp->mat      = Cmat;
    Ccusp->mat->mat = Ccsr;
    Ccsr->num_rows  = m;
    Ccsr->num_cols  = n;
    stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr);
    if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr  = (CsrMatrix*)Acusp->mat->mat;
    Bcsr  = (CsrMatrix*)Bcusp->mat->mat;
    Annz  = (PetscInt)Acsr->column_indices->size();
    Bnnz  = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets    = new THRUSTINTARRAY32(m+1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    Ccsr->num_entries    = c->nz;
    Ccusp->cooPerm       = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      auto Acoo = new THRUSTINTARRAY32(Annz);
      auto Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff,*Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
          ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
          ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      stat = cusparseXcsr2coo(Acusp->handle,
                              Aroff->data().get(),
                              Annz,
                              m,
                              Acoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle,
                              Broff->data().get(),
                              Bnnz,
                              m,
                              Bcoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
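      /* Acoo/Bcoo now hold the COO row indices of A and B. Below, the (row, shifted column,
         value, tag) tuples of A and B are merged into C with a single thrust::merge keyed on
         (row, column). The constant tags (1 for A, 0 for B) land in wPerm and record which
         source each entry of C came from, so that cooPerm can be split into the destinations
         of A's values (first Annz slots) and of B's values (remaining Bnnz slots) for fast
         value updates in the MAT_REUSE_MATRIX path. */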
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
      auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
      auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
      auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
      auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
      auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
      auto p1 = Ccusp->cooPerm->begin();
      auto p2 = Ccusp->cooPerm->begin();
      thrust::advance(p2,Annz);
      PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
      thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
#endif
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      auto pred = thrust::identity<int>();
      PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
      PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
#endif
      stat = cusparseXcoo2csr(Ccusp->handle,
                              Ccoo->data().get(),
                              c->nz,
                              m,
                              Ccsr->row_offsets->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
                               Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
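      /* Since C = [A,B], its transpose is C' = [A';B'] stacked by rows: A' occupies the first
         a->nz entries and B' the rest, with B''s row offsets shifted by a->nz. When both A and B
         already carry a transpose on the GPU, C''s CSR structure is assembled below by
         concatenating the two instead of recomputing it with a csr2csc conversion. */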
      if (Acusp->transgen && Bcusp->transgen) { /* if A and B have the transpose, generate C transpose too */
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix *CcsrT = new CsrMatrix;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

        Ccusp->transgen       = PETSC_TRUE;
        Ccusp->transupdated   = PETSC_TRUE;
        Ccusp->rowoffsets_gpu = NULL;
        CmatT->cprowIndices   = NULL;
        CmatT->mat            = CcsrT;
        CcsrT->num_rows    = n;
        CcsrT->num_cols    = m;
        CcsrT->num_entries = c->nz;

        CcsrT->row_offsets    = new THRUSTINTARRAY32(n+1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values         = new THRUSTARRAY(c->nz);

        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
          thrust::advance(rT,-1);
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
          thrust::copy(titb,tite,rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

        stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
                                 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
                                 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii = *Ccsr->row_offsets;
      jj = *Ccsr->column_indices;
      cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    } else {
      cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
    c->maxnz         = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax          = 0;
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i+1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax,nn);
    }
    ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
    (*C)->nonzerostate++;
    ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
    ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",(*C)->rmap->n,B->rmap->n);
    c = (Mat_SeqAIJ*)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
      if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
      if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
      if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
      if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix*)Acusp->mat->mat;
      Bcsr = (CsrMatrix*)Bcusp->mat->mat;
      Ccsr = (CsrMatrix*)Ccusp->mat->mat;
      if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
      if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
      if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
      if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
      if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid,Acsr->num_entries);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      thrust::for_each(zibait,zieait,VecCUDAEquals());
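      /* same scatter for B: its values go to the positions recorded in the second half of cooPerm */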
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
      thrust::for_each(zibbit,ziebit,VecCUDAEquals());
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr);
      if (Acusp->transgen && Bcusp->transgen && Ccusp->transgen) {
        if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        Ccusp->transupdated = PETSC_TRUE;
      }
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    }
  }
  ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;
  const PetscScalar *av;
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    THRUSTARRAY *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w  = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) {
      cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    delete w;
  } else {
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); }
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
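
/* Example usage of MatSeqAIJCUSPARSEMergeMats defined earlier in this file (illustrative sketch
   only; A, B, ierr and the surrounding error handling are placeholders, not part of this file):

     Mat A,B,C;
     // ... assemble A (m x nA) and B (m x nB) as MATSEQAIJCUSPARSE ...
     ierr = MatSeqAIJCUSPARSEMergeMats(A,B,MAT_INITIAL_MATRIX,&C);CHKERRQ(ierr);  // C = [A,B], size m x (nA+nB)
     // ... update the numerical values of A and B without changing their nonzero patterns ...
     ierr = MatSeqAIJCUSPARSEMergeMats(A,B,MAT_REUSE_MATRIX,&C);CHKERRQ(ierr);    // refresh C's values on the GPU
*/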