/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <thrust/async/for_each.h>

/* String table for MatCUSPARSEStorageFormat; trailing entries are the enum-type and prefix names PetscOptionsEnum() expects */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
  } cusparseCsr2CscAlg_t;
*/
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif

/* Forward declarations: factorization entry points installed on factor matrices */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* Forward declarations: triangular solves and matrix-vector operations */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static
PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

/* Forward declarations: destruction helpers for the GPU-side structures */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);

PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);

/* Attach a user-supplied CUDA stream to the matrix' cuSPARSE handle so subsequent
   cuSPARSE calls are issued on that stream. Errors if the GPU data structure is absent. */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

/* Replace the matrix' cuSPARSE handle with a user-supplied one, destroying any
   previously owned handle, and switch the handle to device pointer mode. */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

/* Forget (without destroying) the cuSPARSE handle stored on A; no-op when A is not
   a MATSEQAIJCUSPARSE or has no GPU structure. */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          flg;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg || !cusparsestruct) PetscFunctionReturn(0);
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

/* Report MATSOLVERCUSPARSE as the solver package for this factor matrix */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/* Factory for MATSOLVERCUSPARSE factor matrices: creates B of type MATSEQAIJCUSPARSE,
   installs the symbolic-factorization ops matching ftype, and records the preferred
   orderings (nested dissection for full factorizations, natural for incomplete ones). */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr);
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr);
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  (*B)->canuseordering = PETSC_TRUE;
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type-specific implementation behind MatCUSPARSESetFormat(): records the requested
   storage format; MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL both set the single format field here. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.
   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB.
The latter two require CUDA 4.2)

   Output Parameter:

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation if composed on A; silently no-op otherwise */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* MatSetOption implementation: intercepts MAT_FORM_EXPLICIT_TRANSPOSE to drop a stale
   GPU transpose when the option is turned off; everything else is delegated to SeqAIJ. */
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_FORM_EXPLICIT_TRANSPOSE:
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);}
    A->form_explicit_transpose = flg;
    break;
  default:
    ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
    break;
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);

/* Numeric LU factorization: factors on the CPU via SeqAIJ, then selects the
   natural-ordering or permuted MatSolve variants and uploads the triangular factors. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Options processing for MATSEQAIJCUSPARSE: storage format for SpMV / all ops, and
   (CUDA >= 11) the cuSPARSE SpMV, SpMM and csr2csc algorithm choices. Each enum read
   back from PetscOptionsEnum() is sanity-checked against the cuSPARSE header values. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
    if (flg && CUSPARSE_SPMV_CSR_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Symbolic ILU: resets any previous GPU triangular factors, performs the CPU
   symbolic phase, and installs the CUSPARSE numeric-factorization hook. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic LU: same pattern as ILU — reset GPU factors, CPU symbolic phase,
   install the CUSPARSE numeric hook. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic ICC: reset GPU factors, CPU symbolic phase, install the CUSPARSE
   Cholesky numeric hook. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic Cholesky: reset GPU factors, CPU symbolic phase, install the CUSPARSE
   Cholesky numeric hook. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Build (or refresh the values of) the unit-diagonal lower-triangular factor L of an
   ILU factorization on the GPU: assembles a CSR copy of L in pinned host memory,
   uploads it into thrust arrays, and runs the cuSPARSE triangular-solve analysis. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;

        /* pinned host buffers so the thrust assign() uploads below can stream efficiently */
        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix: row 0 is just the implicit unit diagonal;
           each later row copies the strictly-lower entries then appends the unit diagonal */
        AiLo[0]   = (PetscInt) 0;
        AiLo[n]   = nzLower;
        AjLo[0]   = (PetscInt) 0;
        AALo[0]   = (MatScalar) 1.0;
        v         = aa;
        vi        = aj;
        offset    = 1;
        rowOffset = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* newer csrsv2 API needs an explicitly sized work buffer for the analysis/solve */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 loTriFactor->solveInfo,
                                 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                                 loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        /* AALo is kept (as AA_h) for later value-only updates; the index buffers are not needed again */
        loTriFactor->AA_h = AALo;
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v      = aa;
        vi     = aj;
        offset = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Build (or refresh the values of) the upper-triangular factor U of an ILU
   factorization on the GPU. U is assembled row-by-row from the end of the SeqAIJ
   "diag" layout, storing the reciprocal of the diagonal entry first in each row. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix, walking rows from last to first
           (the SeqAIJ diag array stores U rows back-to-front) */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;
          vi = aj + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements: stored inverted so the solve multiplies */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1./v[nz];
          AiUp[i]      = AiUp[i+1] - (nz+1);

          ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset+1]), v,
nz);CHKERRQ(ierr);
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = n;
        upTriFactor->csrMat->num_cols = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* newer csrsv2 API needs an explicitly sized work buffer for the analysis/solve */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 upTriFactor->solveInfo,
                                 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                                 upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
        /* AAUp is kept (as AA_h) for later value-only updates; the index buffers are not needed again */
        upTriFactor->AA_h = AAUp;
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      } else {
        /* update values only; the sparsity pattern (and GPU index arrays) are unchanged */
        if (!upTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v = aa + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1./v[nz];
          ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
        ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Upload both ILU triangular factors to the GPU, allocate the solve work vector,
   and cache the row/column permutation indices (when non-identity) on the device. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cusparseTriFactors->nnz=a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
    ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* upper triangular indices */
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
    ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Build both triangular factors of an ICC factorization on the GPU. The input is
   stored in SBAIJ (upper-triangular) layout; U and L^T share the same pattern.
   NOTE(review): this definition continues past the end of this chunk. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];
          offset+=1;
          if (nz>0) {
            /* off-diagonal entries of the row, negated; AALo holds U^T scaled by the diagonal */
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 upTriFactor->solveInfo,
                                 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                                 upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        /* the lower factor L = U^T is never stored explicitly: it reuses the UPPER
           structure and is applied with OPERATION_TRANSPOSE below */
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 loTriFactor->solveInfo,
                                 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                                 loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else {
        /* structure already on the GPU: recompute values only */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Build the ICC triangular factors on the GPU and, for non-identity orderings,
   cache the permutation (rperm) and its inverse (cperm) used by MatSolve. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  /* both factors share the upper structure: nnz counts off-diagonals twice plus the diagonal once */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip,*rip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Numeric Cholesky factorization: factor on the CPU, pick the MatSolve flavor
   based on the ordering, then move the factors to the GPU. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr =
MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Build transposed copies (CSC) of both triangular factors and run the cusparse
   solve analysis on them, so MatSolveTranspose can use NON_TRANSPOSE solves.
   The results are cached in loTriFactorPtrTranspose/upTriFactorPtrTranspose. */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor; the fill mode flips under transposition */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                       loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                       loTriFactor->csrMat->values->data().get(),
                                       loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(),
                                       loTriFactorT->csrMat->values->data().get(),
                                       loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
#else
                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
#endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* bug fix: was PetscLogEventBegin, leaving the GenerateTranspose event begun twice */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                           loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                           loTriFactorT->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           loTriFactorT->solveInfo,
                           loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                           loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                       upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                       upTriFactor->csrMat->values->data().get(),
                                       upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(),
                                       upTriFactorT->csrMat->values->data().get(),
                                       upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
#else
                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
#endif

  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* bug fix: was PetscLogEventBegin, leaving the GenerateTranspose event begun twice */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                           upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                           upTriFactorT->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           upTriFactorT->solveInfo,
                           upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                           upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#endif

  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}

/* Functor mapping a PetscScalar holding an integral value back to a PetscInt;
   used to recover the csr2csc permutation from transposed values. */
struct PetscScalarToPetscInt
{
  __host__ __device__
  PetscInt operator()(PetscScalar s)
  {
    return (PetscInt)PetscRealPart(s);
  }
};

/* Form (or refresh) the explicit transpose of the matrix on the GPU for
   MatMultTranspose; caches a csr2csc permutation so later updates are a gather. */
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (!A->form_explicit_transpose || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0);
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(0); /* cached transpose is still valid */
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
  }
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

    /* set alpha and beta */
    err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      CsrMatrix *matrixT      = new CsrMatrix;
      matstructT->mat         = matrixT;
      matrixT->num_rows       = A->cmap->n;
      matrixT->num_cols       = A->rmap->n;
      matrixT->num_entries    = a->nz;
      matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows+1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values         = new THRUSTARRAY(a->nz);

      if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&matstructT->matDescr,
                               matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                               matrixT->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows       = A->rmap->n;
      temp->num_cols       = A->cmap->n;
      temp->num_entries    = a->nz;
      temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n+1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle,
                              matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get());CHKERRCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      /* NOTE(review): tempT dims mirror A (rmap x cmap) rather than being swapped;
         appears to assume a square matrix here — confirm. */
      tempT->num_rows       = A->rmap->n;
      tempT->num_cols       = A->cmap->n;
      tempT->num_entries    = a->nz;
      tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n+1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                              temp->num_cols, temp->num_entries,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get(),
                              tempT->values->data().get(),
                              tempT->column_indices->data().get(),
                              tempT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
      cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
        CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                              matstructT->descr, tempT->values->data().get(),
                              tempT->row_offsets->data().get(),
                              tempT->column_indices->data().get(),
                              hybMat, 0, partition);CHKERRCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY*) tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
        delete (CsrMatrix*) tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY*) temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
        delete (CsrMatrix*) temp;
      }
#endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
    if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
    if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
    if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
    if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
    if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
    if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
    if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
    if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
      ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    if (!cusparsestruct->csr2csc_i) {
      /* transpose the identity permutation 0,1,2,... stored as scalars, so the
         transposed values encode the CSR->CSC entry permutation */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      void   *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                           A->cmap->n, matrix->num_entries,
                                           matrix->values->data().get(),
                                           cusparsestruct->rowoffsets_gpu->data().get(),
                                           matrix->column_indices->data().get(),
                                           matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                           CUSPARSE_ACTION_NUMERIC,indexBase,
                                           cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
      err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err);
#endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                                A->cmap->n,matrix->num_entries,
                                csr2csc_a.data().get(),
                                cusparsestruct->rowoffsets_gpu->data().get(),
                                matrix->column_indices->data().get(),
                                matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                CUSPARSE_ACTION_NUMERIC,indexBase,
                                cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat);
#else
                                matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                                CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
#endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
      }

      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      err = cudaFree(csr2cscBuffer);CHKERRCUDA(err);
#endif
    }
    /* gather the current values through the cached permutation into the transpose */
    PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
                                      thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
                                      matrixT->values->begin()));
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE?
*/
/* Solve A^T x = b with a reordered factorization: apply the row permutation,
   back-solve with U^T, back-solve with L^T, then apply the column permutation.
   The transposed triangular factors are built lazily on first use. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder b with the row permutation; the permuted rhs lands in x */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U^T (x -> tempGPU) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
#endif

  /* Then, solve L^T (tempGPU -> x) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        xarray);CHKERRCUSPARSE(stat);
#endif

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary back into the full solution vector */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Same as MatSolveTranspose_SeqAIJCUSPARSE but for factors produced in natural
   ordering, so no row/column permutations are applied */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U^T (b -> tempGPU) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
#endif

  /* Then, solve L^T (tempGPU -> x) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        xarray);CHKERRCUSPARSE(stat);
#endif

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Solve A x = b with reordered triangular factors: permute b by the row
   permutation, forward-solve L, back-solve U, permute by the column permutation */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder b with the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L (tempGPU -> x) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        xarray);CHKERRCUSPARSE(stat);
#endif

  /* Then, solve U (x -> tempGPU) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,xarray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
#endif

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Solve A x = b with naturally ordered factors: no permutations needed */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L (b -> tempGPU) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        loTriFactor->solvePolicy,loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
#endif

  /* Next, solve U (tempGPU -> x) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        xarray);CHKERRCUSPARSE(stat);
#endif

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Copy the matrix values (only the values, not the row/column structure) from
   the GPU back to the host CSR arrays when the GPU copy is the fresh one */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cudaError_t        cerr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat;

    ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

/* Expose the host CSR values array; since the caller may modify it, the CPU
   copy becomes authoritative afterwards */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  *array = a->a;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}

/* Copy (or rebuild) the GPU representation of the matrix from the host CSR
   data. If only the numerical values changed (same nonzero state, CSR format),
   just refresh the values; otherwise destroy and rebuild the whole structure */
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu)
SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      matrix->values->assign(a->a, a->a+a->nz);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* values changed, so any cached transpose is stale (structure kept) */
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
    } else {
      PetscInt nnz;
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector     = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* pick the compressed-row view when in use, otherwise the full CSR */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
        if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");
        /* a->a may legitimately be absent (structure-only matrix); then the
           GPU copy cannot be marked as holding both copies of the values */
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
        else nnz = a->nz;

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident scalars for alpha/beta since the handle uses
           CUSPARSE_POINTER_MODE_DEVICE below */
        err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                     mat->num_rows, mat->num_cols, mat->num_entries,
                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                     mat->values->data().get(),
                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* stage through a temporary CSR, convert to HYB, then free the CSR */
          CsrMatrix *mat = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
                                  matstruct->descr, mat->values->data().get(),
                                  mat->row_offsets->data().get(),
                                  mat->column_indices->data().get(),
                                  hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

/* Thrust functor: second tuple element += first (used for scatter-add) */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/* Thrust functor: second tuple element = first (plain copy) */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
1922 }; 1923 1924 struct VecCUDAEqualsReverse 1925 { 1926 template <typename Tuple> 1927 __host__ __device__ 1928 void operator()(Tuple t) 1929 { 1930 thrust::get<0>(t) = thrust::get<1>(t); 1931 } 1932 }; 1933 1934 struct MatMatCusparse { 1935 PetscBool cisdense; 1936 PetscScalar *Bt; 1937 Mat X; 1938 PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */ 1939 PetscLogDouble flops; 1940 CsrMatrix *Bcsr; 1941 1942 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1943 cusparseSpMatDescr_t matSpBDescr; 1944 PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 1945 cusparseDnMatDescr_t matBDescr; 1946 cusparseDnMatDescr_t matCDescr; 1947 PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/ 1948 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 1949 void *dBuffer4; 1950 void *dBuffer5; 1951 #endif 1952 size_t mmBufferSize; 1953 void *mmBuffer; 1954 void *mmBuffer2; /* SpGEMM WorkEstimation buffer */ 1955 cusparseSpGEMMDescr_t spgemmDesc; 1956 #endif 1957 }; 1958 1959 static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 1960 { 1961 PetscErrorCode ierr; 1962 MatMatCusparse *mmdata = (MatMatCusparse *)data; 1963 cudaError_t cerr; 1964 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1965 cusparseStatus_t stat; 1966 #endif 1967 1968 PetscFunctionBegin; 1969 cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr); 1970 delete mmdata->Bcsr; 1971 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1972 if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); } 1973 if (mmdata->matBDescr) { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); } 1974 if (mmdata->matCDescr) { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); } 1975 if (mmdata->spgemmDesc) { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); } 1976 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 1977 if (mmdata->dBuffer4) { cerr = cudaFree(mmdata->dBuffer4);CHKERRCUDA(cerr); } 
1978 if (mmdata->dBuffer5) { cerr = cudaFree(mmdata->dBuffer5);CHKERRCUDA(cerr); } 1979 #endif 1980 if (mmdata->mmBuffer) { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); } 1981 if (mmdata->mmBuffer2) { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); } 1982 #endif 1983 ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr); 1984 ierr = PetscFree(data);CHKERRQ(ierr); 1985 PetscFunctionReturn(0); 1986 } 1987 1988 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool); 1989 1990 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 1991 { 1992 Mat_Product *product = C->product; 1993 Mat A,B; 1994 PetscInt m,n,blda,clda; 1995 PetscBool flg,biscuda; 1996 Mat_SeqAIJCUSPARSE *cusp; 1997 cusparseStatus_t stat; 1998 cusparseOperation_t opA; 1999 const PetscScalar *barray; 2000 PetscScalar *carray; 2001 PetscErrorCode ierr; 2002 MatMatCusparse *mmdata; 2003 Mat_SeqAIJCUSPARSEMultStruct *mat; 2004 CsrMatrix *csrmat; 2005 2006 PetscFunctionBegin; 2007 MatCheckProduct(C,1); 2008 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty"); 2009 mmdata = (MatMatCusparse*)product->data; 2010 A = product->A; 2011 B = product->B; 2012 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2013 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2014 /* currently CopyToGpu does not copy if the matrix is bound to CPU 2015 Instead of silently accepting the wrong answer, I prefer to raise the error */ 2016 if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2017 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2018 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2019 switch (product->type) { 2020 case MATPRODUCT_AB: 2021 case MATPRODUCT_PtAP: 2022 mat = cusp->mat; 2023 
opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2024 m = A->rmap->n; 2025 n = B->cmap->n; 2026 break; 2027 case MATPRODUCT_AtB: 2028 if (!A->form_explicit_transpose) { 2029 mat = cusp->mat; 2030 opA = CUSPARSE_OPERATION_TRANSPOSE; 2031 } else { 2032 ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr); 2033 mat = cusp->matTranspose; 2034 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2035 } 2036 m = A->cmap->n; 2037 n = B->cmap->n; 2038 break; 2039 case MATPRODUCT_ABt: 2040 case MATPRODUCT_RARt: 2041 mat = cusp->mat; 2042 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2043 m = A->rmap->n; 2044 n = B->rmap->n; 2045 break; 2046 default: 2047 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2048 } 2049 if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 2050 csrmat = (CsrMatrix*)mat->mat; 2051 /* if the user passed a CPU matrix, copy the data to the GPU */ 2052 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr); 2053 if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);} 2054 ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr); 2055 2056 ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr); 2057 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2058 ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2059 ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr); 2060 } else { 2061 ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr); 2062 ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr); 2063 } 2064 2065 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2066 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2067 cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? 
CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 2068 /* (re)allocate mmBuffer if not initialized or LDAs are different */ 2069 if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 2070 size_t mmBufferSize; 2071 if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;} 2072 if (!mmdata->matBDescr) { 2073 stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2074 mmdata->Blda = blda; 2075 } 2076 2077 if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;} 2078 if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 2079 stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2080 mmdata->Clda = clda; 2081 } 2082 2083 if (!mat->matDescr) { 2084 stat = cusparseCreateCsr(&mat->matDescr, 2085 csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, 2086 csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), 2087 csrmat->values->data().get(), 2088 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2089 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 2090 } 2091 stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one, 2092 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2093 mmdata->matCDescr,cusparse_scalartype, 2094 cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat); 2095 if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) { 2096 cudaError_t cerr; 2097 cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); 2098 cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr); 2099 mmdata->mmBufferSize = mmBufferSize; 2100 } 2101 mmdata->initialized = PETSC_TRUE; 
2102 } else { 2103 /* to be safe, always update pointers of the mats */ 2104 stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat); 2105 stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat); 2106 stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat); 2107 } 2108 2109 /* do cusparseSpMM, which supports transpose on B */ 2110 stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one, 2111 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2112 mmdata->matCDescr,cusparse_scalartype, 2113 cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2114 #else 2115 PetscInt k; 2116 /* cusparseXcsrmm does not support transpose on B */ 2117 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2118 cublasHandle_t cublasv2handle; 2119 cublasStatus_t cerr; 2120 2121 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 2122 cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T, 2123 B->cmap->n,B->rmap->n, 2124 &PETSC_CUSPARSE_ONE ,barray,blda, 2125 &PETSC_CUSPARSE_ZERO,barray,blda, 2126 mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr); 2127 blda = B->cmap->n; 2128 k = B->cmap->n; 2129 } else { 2130 k = B->rmap->n; 2131 } 2132 2133 /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 2134 stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k, 2135 csrmat->num_entries,mat->alpha_one,mat->descr, 2136 csrmat->values->data().get(), 2137 csrmat->row_offsets->data().get(), 2138 csrmat->column_indices->data().get(), 2139 mmdata->Bt ? 
                             mmdata->Bt : barray,blda,mat->beta_zero,
                             carray,clda);CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  /* for RARt/PtAP the sparse-dense product landed in the intermediate mmdata->X;
     finish with a dense-dense product against B (transposed for PtAP, not for RARt) */
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* if the user's C (resp. B) was a host dense matrix, convert the result back in place */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Symbolic phase for products C = A*B, A^T*B, A*B^t, P^t*A*P, R*A*R^t with A of type
   SEQAIJCUSPARSE (CSR format only) and B dense: computes the result sizes, sets C to
   MATSEQDENSECUDA (remembering whether the caller passed a host MATSEQDENSE C so the
   numeric phase can convert back), and allocates the MatMatCusparse product data,
   including the B^T work buffer (pre-CUDA-11 path) and the intermediate dense X
   needed by RARt/PtAP. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;        /* global/local sizes of the result (sequential, so identical) */
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  /* result dimensions per product type; PtAP/RARt results are square in B's sizes */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense;
#if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}

/* Numeric phase for sparse*sparse products (AB, AtB, ABt) with all operands of type
   SEQAIJCUSPARSE in CSR format. Reuses the symbolic data stored in C->product->data:
   on CUDA >= 11 it re-runs the SpGEMM (or SpGEMMreuse) computation with the buffers
   allocated by the symbolic phase; on older CUDA it calls the legacy csrgemm.
   Transposed operands are served from the explicit transposes formed at symbolic time. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
    goto finalize;             /* values already on the GPU; just run the assembly bookkeeping below */
  }
  if (!c->nz) goto finalize;   /* empty product: nothing to compute */
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);

  /* symmetric operands let us drop the transpose and use the plain CSR data */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  /* sparsity pattern and buffers were set up by SpGEMMreuse at symbolic time; only recompute values */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                                     Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                     cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                     mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  /* compute into the descriptor's workspace, then copy the values into C's CSR arrays */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#endif
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
  ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}

/* Symbolic phase for sparse*sparse products (AB, AtB, ABt) with SEQAIJCUSPARSE operands
   in CSR format: determines the sparsity pattern of C on the GPU (SpGEMMreuse on
   CUDA >= 11.4, SpGEMM on CUDA 11.x, Xcsrgemm before that), allocates C's device and
   host CSR storage, and stashes descriptors/buffers in the MatMatCusparse product
   data for reuse by the numeric phase. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed,ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t         BmatSpDescr;
#else
  int                          cnz;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  if (Acusp->format !=
 MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
  /* symmetric operands let us drop the transpose and use the plain CSR data */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  /* pick operand mult structs (forming explicit transposes when needed) and note
     whether B (and hence C) use compressed-row storage */
  switch (ptype) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    k = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr);
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  c = (Mat_SeqAIJ*)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  Cmat = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
    c->compressedrow.nrows = a->compressedrow.nrows;
    ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
    ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat        = Cmat;
  Ccusp->mat->mat   = Ccsr;
  Ccsr->num_rows    = Ccusp->nrows;
  Ccsr->num_cols    = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
  stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
  /* device-resident scalars (1,0,1) used with CUSPARSE_POINTER_MODE_DEVICE */
  cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
    c->nz = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix*)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    /* build a shallow CSR view of B with full (uncompressed) row offsets;
       column indices and values are shared with B's compressed storage */
    CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
    Bcsr = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
      ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr = Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
                               Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                               Bcsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  /* precompute flops count */
  if (ptype == MATPRODUCT_AB) {
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i+1];
      for (j=st; j<en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2.*(b->i[brow+1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt anzi = a->i[i+1] - a->i[i];
      const PetscInt bnzi = b->i[i+1] - b->i[i];
      flops += (2.*anzi)*bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* C's descriptor starts empty (nnz 0, NULL arrays); pointers are set after the pattern is known */
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
                           NULL, NULL, NULL,
                           CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                           CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  {
    /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
       We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
    */
    void*  dBuffer1 = NULL;
    void*  dBuffer2 = NULL;
    void*  dBuffer3 = NULL;
    /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
    size_t bufferSize1 = 0;
    size_t bufferSize2 = 0;
    size_t bufferSize3 = 0;
    size_t bufferSize4 = 0;
    size_t bufferSize5 = 0;

    /*----------------------------------------------------------------------*/
    /* ask bufferSize1 bytes for external memory */
    stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                              CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                              &bufferSize1, NULL);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc((void**) &dBuffer1, bufferSize1);CHKERRCUDA(cerr);
    /* inspect the matrices A and B to understand the memory requirement for the next step */
    stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                              CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                              &bufferSize1, dBuffer1);CHKERRCUSPARSE(stat);

    /*----------------------------------------------------------------------*/
    stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                   CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                   &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc((void**) &dBuffer2, bufferSize2);CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**) &dBuffer3, bufferSize3);CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4);CHKERRCUDA(cerr);
    stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                   CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                   &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);CHKERRCUSPARSE(stat);
    cerr = cudaFree(dBuffer1);CHKERRCUDA(cerr);
    cerr = cudaFree(dBuffer2);CHKERRCUDA(cerr);

    /*----------------------------------------------------------------------*/
    /* get matrix C non-zero entries C_nnz1 */
    stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
    c->nz = (PetscInt) C_nnz1;
    /* allocate matrix C */
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
    Ccsr->values = new THRUSTARRAY(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
    /* update matC with the new pointers */
    stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                  Ccsr->values->data().get());CHKERRCUSPARSE(stat);

    /*----------------------------------------------------------------------*/
    stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                    CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                    &bufferSize5, NULL);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5);CHKERRCUDA(cerr);
    stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                    CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                    &bufferSize5, mmdata->dBuffer5);CHKERRCUSPARSE(stat);
    cerr = cudaFree(dBuffer3);CHKERRCUDA(cerr);
    stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
    ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024);CHKERRQ(ierr);
  }
#else // ~PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  size_t bufSize2;
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
  c->nz = (PetscInt) C_nnz1;
  ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr);
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#endif
#else
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
  stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
  c->nz = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalizesym:
  /* mirror the GPU pattern into host CSR arrays so the SeqAIJ side of C is usable */
  c->singlemalloc = PETSC_FALSE;
  c->free_a       = PETSC_TRUE;
  c->free_ij      = PETSC_TRUE;
  ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
    PetscInt *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii = *Ccsr->row_offsets;
    jj = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  if (ciscompressed) { /* need to expand host row offsets */
    PetscInt r = 0;
    c->i[0] = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old  = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r+1] = old;
    }
    for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
  c->maxnz         = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax          = 0;
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k+1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt)!!nn;
    c->rmax = PetscMax(c->rmax,nn);
  }
  ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated  = PETSC_TRUE;
  C->assembled     = PETSC_FALSE;
  C->was_assembled = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/* handles sparse or dense B */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscErrorCode ierr;
  PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr);
  /* operands bound to the CPU force the CPU backend */
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
  }
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr);
    }
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    /* per-product-type option to opt out of the GPU backend; option name depends on
       whether the user went through the old MatMatMult-style API (api_user) */
    switch (product->type) {
    case MATPRODUCT_AB:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr);
        ierr =
PetscOptionsBool("-matproduct_ab_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_AtB:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_atb_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_PtAP:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_ptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_RARt:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_rart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_ABC:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_abc_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    default:
      break;
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE; /* user explicitly requested the CPU backend */
  }
  /* dispatch: pick the symbolic-product implementation based on operand types and backend choice */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) { /* A bound to CPU -> use the host sparse*dense path */
        ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) { /* all operands on the GPU */
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      /* triple products are realized as chains of binary products */
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* yy = A xx */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* zz = A xx + yy */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* yy = A^H xx (conjugate transpose) */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* zz = A^H xx + yy */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* yy = A^T xx */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* y[idx[i]] += x[i] for i in [0,n): scatter-add a (compressed) work vector into the full-length result.
   One thread per entry of x; launched with a 1D grid sized to cover n, with a bounds check for the tail. */
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}

/* z = op(A) x + y.
   If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op.
   yy may be NULL (then z = op(A) x), and zz may alias yy. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny; /* lengths of x and y for y = op(A) x; set in the CSR branches below */
#endif

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) {
    /* empty matrix: op(A) x == 0, so the result is just y (or zero when there is no y) */
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      /* let cuSPARSE apply the (conjugate) transpose implicitly on the untransposed data */
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      /* use (and build on demand) an explicitly stored transpose */
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get up-to-date zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
         allocated to accommodate different uses. So we get the length info directly from mat.
      */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
      */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA indexes the per-operation cuSpMV cache below, hence the range guard */
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        cudaError_t cerr;
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                       matstruct->matDescr,
                                       matstruct->cuSpMV[opA].vecXDescr, beta,
                                       matstruct->cuSpMV[opA].vecYDescr,
                                       cusparse_scalartype,
                                       cusparsestruct->spmvAlg,
                                       &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                          matstruct->alpha_one,
                          matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTransposeForMult() */
                          matstruct->cuSpMV[opA].vecXDescr,
                          beta,
                          matstruct->cuSpMV[opA].vecYDescr,
                          cusparse_scalartype,
                          cusparsestruct->spmvAlg,
                          matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
#else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
#endif
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
        */
#if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                                thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                                thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                                VecCUDAPlusEquals());
#else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
#endif
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  /* flops: 2 per stored nonzero (mul+add); without the y addend, one add per nonempty row is saved */
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* zz = A^T xx + yy */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Finish assembly on the host, then drop any cached device matrix if the nonzero structure changed */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode     ierr;
  PetscObjectState   onnz = A->nonzerostate;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    cudaError_t cerr;

    ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr);
    cerr = cudaFree(cusp->deviceMat);CHKERRCUDA(cerr);
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Free GPU-side storage (plain or factored), clear composed methods, then do the host AIJ destroy */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  /* reset composed functions so no stale pointers survive the object's destruction */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr =
PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaijcusparse_hypre_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate on the host via SeqAIJ, then convert the copy in place back to SEQAIJCUSPARSE */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Y = Y + a*X, dispatching between cuBLAS axpy (same pattern), cuSPARSE spgeam (subset pattern)
   and the host fallback */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    /* operands use different implementations (e.g. one is bound to CPU): fall back to the host path */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
  if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy: same nz count and identical row/column structure on device */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    /* Y = a*X + 1*Y via cuSPARSE csrgeam, writing the result in place over Y's arrays */
    cusparseStatus_t stat;
    PetscScalar      b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t           bufferSize;
    void             *buffer;
    cudaError_t      cerr;
#endif

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    /* the scalars a and b live on the host here, so switch the pointer mode around the call */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                          cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                               cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    cerr = cudaFree(buffer);CHKERRCUDA(cerr);
#else
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                               cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#endif
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical structure: the values arrays line up, so this is a dense axpy on the nonzeros */
    cublasHandle_t cublasv2handle;
    cublasStatus_t berr;
    PetscBLASInt   one = 1, bnz = 1;

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
    ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else {
    /* different patterns: let the host implementation rebuild the structure */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat
Y,PetscScalar a)
{
  /* Y = a*Y: scale the nonzero values in place with a cuBLAS scal over the nz-long values array */
  PetscErrorCode ierr;
  Mat_SeqAIJ     *y = (Mat_SeqAIJ*)Y->data;
  PetscScalar    *ay;
  cublasHandle_t cublasv2handle;
  cublasStatus_t berr;
  PetscBLASInt   one = 1, bnz = 1;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(y->nz,&bnz);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  berr = cublasXscal(cublasv2handle,bnz,&a,ay,one);CHKERRCUBLAS(berr);
  ierr = PetscLogGpuFlops(bnz);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Zero the values of A on both device (mat and its cached transpose, if present) and host */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  PetscBool      both = PETSC_FALSE; /* set if device values were zeroed too, so the offload mask can stay BOTH */
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (spptr->mat) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE;
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
    if (spptr->matTranspose) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
      if (matrix->values) {
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
  }
  //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
  ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}

/* Switch A's operation table and composed methods between the host (flg=TRUE) and the
   CUSPARSE (flg=FALSE) implementations; factored matrices are left untouched */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  if (flg) {
    /* binding to CPU: make sure host values are current before routing operations there */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  a->inode.use = flg; /* inode optimization only applies to the host kernels */
  PetscFunctionReturn(0);
}

/* Convert a SeqAIJ matrix to SEQAIJCUSPARSE: (re)create the GPU side data structures,
   install the CUSPARSE operation table and composed methods, and set the default vec type */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat;

  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      spptr->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
      spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
#else
      spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
#endif
      spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = spptr;
    } else {
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr); /* installs the GPU op table */
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
#if defined(PETSC_HAVE_HYPRE)
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}
/* Constructor for MATSEQAIJCUSPARSE: build a plain SeqAIJ matrix, then convert it in place. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);

/* Register the CUSPARSE solver package: LU/Cholesky/ILU/ICC factorizations for seqaijcusparse,
   plus the banded-LU variant for plain seqaij matrices. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}

/* Destroy the Mat_SeqAIJCUSPARSE context: both mult structures (plain and transpose),
   the work/permutation device vectors, and the cuSPARSE handle. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->csr2csc_i;
    if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Free the three device arrays of a CsrMatrix and the struct itself; *mat is NULLed. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    delete (*mat)->values;
    delete (*mat)->column_indices;
    delete (*mat)->row_offsets;
    delete *mat;
    *mat = NULL; /* was '*mat = 0' -- use NULL for consistency with the other destroy routines */
  }
  PetscFunctionReturn(0);
}

/* Destroy one triangular-factor structure: cuSPARSE descriptors, analysis info,
   the CSR storage, and the device/host scratch buffers. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    if ((*trifactor)->solveBuffer) {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
    if ((*trifactor)->AA_h) {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
#endif
    ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Destroy a mult structure in the given storage format: the HYB/ELL or CSR payload,
   the matrix descriptor, device constants, and (CUDA >= 11) the generic SpMV descriptors/buffers. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
#endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr); /* was unchecked; propagate errors like every other destroy call */
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    for (int i=0; i<3; i++) { /* one cached descriptor set per operation: N, T, H */
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}

/* Reset (but do not free) a TriFactors context: destroy the four factor structures,
   the permutation index vectors, the work vector, and the banded-factor device arrays. */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector = NULL;
    if ((*trifactors)->a_band_d) {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);}
    if ((*trifactors)->i_band_d) {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);}
    (*trifactors)->init_dev_prop = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}

/* Fully destroy a TriFactors context: reset it, release the cuSPARSE handle, free the struct. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    if ((handle = (*trifactors)->handle)) { /* intentional assignment; extra parens silence -Wparentheses */
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Lexicographic (row, col) ordering for COO (i,j) tuples. */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    if (t1.get<0>() < t2.get<0>()) return true;
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
    return false;
  }
};

/* Equality of COO (i,j) tuples, used to collapse duplicate entries. */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
    return true;
  }
};

/* 0/1 indicator of a change between consecutive values (used with adjacent_difference). */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return t1 == t2 ?
0 : 1;
  }
};

/* Logical OR of two 0/1 indicators (combines the row- and column-change flags). */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return t1||t2;
  }
};

#include <thrust/iterator/discard_iterator.h>
/* Set the values of a matrix preallocated with MatSetPreallocationCOO_SeqAIJCUSPARSE.
   v[] is in the caller's original COO order; cusp->cooPerm maps it to the sorted CSR order,
   and cusp->cooPerm_a (when present) identifies repeated (i,j) entries to be summed.
   v may be host or device memory; host data is staged through a temporary device array. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY        *cooPerm_v = NULL;
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix          *matrix;
  PetscErrorCode     ierr;
  PetscInt           n;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) { /* no COO preallocation info: fall back to a normal assembly */
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) { /* NULL values: INSERT zeroes the matrix, ADD is a no-op */
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else { /* stage host values on the device */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add them */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
         cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonzeros in d_v[].
         cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals()); /* values[i] = d_v[cooPerm[i]] */
    }
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU; /* only the device copy was written */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
  ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr);
  a->reallocs = 0;
  A->info.mallocs += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}

/* Mark the cached transpose as stale; when destroy is true, free it (and the csr2csc map) outright. */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (!cusp) PetscFunctionReturn(0);
  if (destroy) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
    delete cusp->csr2csc_i;
    cusp->csr2csc_i = NULL;
  }
  A->transupdated = PETSC_FALSE;
  PetscFunctionReturn(0);
}

#include <thrust/binary_search.h>
/* Preallocate A from n COO entries (coo_i, coo_j): sort the pairs on the device, collapse
   duplicates, build the CSR structure on the host, and record the permutation (cooPerm) and
   duplicate map (cooPerm_a) used later by MatSetValuesCOO_SeqAIJCUSPARSE. */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0;
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  cooPerm_n = cusp->cooPerm ?
cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) { /* size changed: old permutation data cannot be reused */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);            /* device copies of the COO indices */
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);    /* per-row end offsets, computed on device */

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);

    /* Ex.
       n = 6
       coo_i = [3,3,1,4,1,4]
       coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    *cusp->cooPerm_a = d_i; /* copy the sorted array */
    THRUSTINTARRAY w = d_j;

    /*
      d_i = [1,1,3,3,4,4]
      d_j = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i = [1,3,3,4,4,x]
                       ^ekey
      d_j = [2,2,3,5,6,x]
                       ^nekey
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0] */
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff()); /* w: [2,2,2,3,5,6] => [2,0,0,1,1,1] */
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a = [0,0,1,1,1,1] */
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /* cooPerm_a = [0,0,1,2,3,4] */
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,          /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin());                                      /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* rebuild the host CSR arrays from the device results */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);     /* count nonempty rows */
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  } else { /* n == 0: empty preallocation */
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);

  A->assembled = PETSC_FALSE;     /* values still need to be set via MatSetValuesCOO */
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices.
   Not collective

   Input Parameters:
+    A - the matrix
-    compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form

   Output Parameters:
+    ia - the CSR row pointers
-    ja - the CSR column indices

   Level: developer

   Notes:
      When compressed is true, the CSR structure does not contain empty rows

.seealso: MatSeqAIJCUSPARSERestoreIJ(), MatSeqAIJCUSPARSEGetArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  if (!i || !j) PetscFunctionReturn(0); /* nothing requested */
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) { /* lazily build and cache the uncompressed row offsets on the device */
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
        ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ()

   Not collective

   Input Parameters:
+    A - the matrix
-    compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form

   Output Parameters:
+    ia - the CSR row pointers
-    ja - the CSR column indices

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetIJ()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* indices remain owned by the matrix; just invalidate the caller's pointers */
  if (i) *i = NULL;
  if (j) *j = NULL;
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: may trigger host-device copies if up-to-date matrix data is on host

.seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); /* make the device copy current */
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead()

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  *a = NULL; /* read access: no state change needed */
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: may trigger host-device copies if up-to-date matrix data is on host

.seealso: MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArray()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); /* make the device copy current before handing out write access */
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;                              /* the caller may write through *a */
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); /* cached transpose becomes stale */
  PetscFunctionReturn(0);
}
/*@C
   MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray()

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetArray()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); /* values may have changed */
  *a = NULL;
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: does not trigger host-device copies and flags data validity on the GPU

.seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSERestoreArrayWrite()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get(); /* note: no CopyToGPU here -- write-only access overwrites the data */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite()

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetArrayWrite()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); /* values were (re)written */
  *a = NULL;
  PetscFunctionReturn(0);
}

/* Lexicographic (row, col) ordering for 4-tuples (i, j, value, permutation index);
   only the index components participate in the comparison. */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    if (t1.get<0>() < t2.get<0>()) return true;
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
    return false;
  }
};

/* Adds a fixed offset to a column index (used to shift B's columns when concatenating). */
struct Shift
{
  int _shift;

  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &c)
  {
    return c + _shift;
  }
};

/* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows.
[A';B']' operation in matlab notation */ 4236 PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 4237 { 4238 PetscErrorCode ierr; 4239 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c; 4240 Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp; 4241 Mat_SeqAIJCUSPARSEMultStruct *Cmat; 4242 CsrMatrix *Acsr,*Bcsr,*Ccsr; 4243 PetscInt Annz,Bnnz; 4244 cusparseStatus_t stat; 4245 PetscInt i,m,n,zero = 0; 4246 cudaError_t cerr; 4247 4248 PetscFunctionBegin; 4249 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4250 PetscValidHeaderSpecific(B,MAT_CLASSID,2); 4251 PetscValidPointer(C,4); 4252 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4253 PetscCheckTypeName(B,MATSEQAIJCUSPARSE); 4254 if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n); 4255 if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 4256 if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4257 if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4258 if (reuse == MAT_INITIAL_MATRIX) { 4259 m = A->rmap->n; 4260 n = A->cmap->n + B->cmap->n; 4261 ierr = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr); 4262 ierr = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr); 4263 ierr = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 4264 c = (Mat_SeqAIJ*)(*C)->data; 4265 Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 4266 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 4267 Ccsr = new CsrMatrix; 4268 Cmat->cprowIndices = NULL; 4269 c->compressedrow.use = PETSC_FALSE; 4270 c->compressedrow.nrows = 0; 4271 c->compressedrow.i = NULL; 4272 c->compressedrow.rindex = NULL; 4273 Ccusp->workVector = NULL; 4274 Ccusp->nrows = m; 4275 Ccusp->mat = Cmat; 4276 Ccusp->mat->mat = Ccsr; 4277 
Ccsr->num_rows = m; 4278 Ccsr->num_cols = n; 4279 stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat); 4280 stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4281 stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 4282 cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 4283 cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 4284 cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 4285 cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4286 cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4287 cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4288 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4289 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 4290 ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr); 4291 ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr); 4292 if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4293 if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4294 4295 Acsr = (CsrMatrix*)Acusp->mat->mat; 4296 Bcsr = (CsrMatrix*)Bcusp->mat->mat; 4297 Annz = (PetscInt)Acsr->column_indices->size(); 4298 Bnnz = (PetscInt)Bcsr->column_indices->size(); 4299 c->nz = Annz + Bnnz; 4300 Ccsr->row_offsets = new THRUSTINTARRAY32(m+1); 4301 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 4302 Ccsr->values = new THRUSTARRAY(c->nz); 4303 Ccsr->num_entries = c->nz; 4304 Ccusp->cooPerm = new THRUSTINTARRAY(c->nz); 4305 if (c->nz) { 4306 auto Acoo = new THRUSTINTARRAY32(Annz); 4307 auto Bcoo = new THRUSTINTARRAY32(Bnnz); 4308 auto Ccoo = new 
THRUSTINTARRAY32(c->nz); 4309 THRUSTINTARRAY32 *Aroff,*Broff; 4310 4311 if (a->compressedrow.use) { /* need full row offset */ 4312 if (!Acusp->rowoffsets_gpu) { 4313 Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 4314 Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 4315 ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 4316 } 4317 Aroff = Acusp->rowoffsets_gpu; 4318 } else Aroff = Acsr->row_offsets; 4319 if (b->compressedrow.use) { /* need full row offset */ 4320 if (!Bcusp->rowoffsets_gpu) { 4321 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 4322 Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 4323 ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 4324 } 4325 Broff = Bcusp->rowoffsets_gpu; 4326 } else Broff = Bcsr->row_offsets; 4327 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4328 stat = cusparseXcsr2coo(Acusp->handle, 4329 Aroff->data().get(), 4330 Annz, 4331 m, 4332 Acoo->data().get(), 4333 CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4334 stat = cusparseXcsr2coo(Bcusp->handle, 4335 Broff->data().get(), 4336 Bnnz, 4337 m, 4338 Bcoo->data().get(), 4339 CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4340 /* Issues when using bool with large matrices on SUMMIT 10.2.89 */ 4341 auto Aperm = thrust::make_constant_iterator(1); 4342 auto Bperm = thrust::make_constant_iterator(0); 4343 #if PETSC_PKG_CUDA_VERSION_GE(10,0,0) 4344 auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n)); 4345 auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n)); 4346 #else 4347 /* there are issues instantiating the merge operation using a transform iterator for the columns of B */ 4348 auto Bcib = Bcsr->column_indices->begin(); 4349 auto Bcie = Bcsr->column_indices->end(); 4350 thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n)); 4351 #endif 4352 auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz); 4353 auto Azb = 
thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm)); 4354 auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm)); 4355 auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm)); 4356 auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm)); 4357 auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin())); 4358 auto p1 = Ccusp->cooPerm->begin(); 4359 auto p2 = Ccusp->cooPerm->begin(); 4360 thrust::advance(p2,Annz); 4361 PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4())); 4362 #if PETSC_PKG_CUDA_VERSION_LT(10,0,0) 4363 thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n)); 4364 #endif 4365 auto cci = thrust::make_counting_iterator(zero); 4366 auto cce = thrust::make_counting_iterator(c->nz); 4367 #if 0 //Errors on SUMMIT cuda 11.1.0 4368 PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>())); 4369 #else 4370 auto pred = thrust::identity<int>(); 4371 PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred)); 4372 PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred)); 4373 #endif 4374 stat = cusparseXcoo2csr(Ccusp->handle, 4375 Ccoo->data().get(), 4376 c->nz, 4377 m, 4378 Ccsr->row_offsets->data().get(), 4379 CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4380 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4381 delete wPerm; 4382 delete Acoo; 4383 delete Bcoo; 4384 delete Ccoo; 4385 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4386 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, 4387 Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), 
4388 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 4389 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 4390 #endif 4391 if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */ 4392 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4393 Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct; 4394 CsrMatrix *CcsrT = new CsrMatrix; 4395 CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4396 CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4397 4398 (*C)->form_explicit_transpose = PETSC_TRUE; 4399 (*C)->transupdated = PETSC_TRUE; 4400 Ccusp->rowoffsets_gpu = NULL; 4401 CmatT->cprowIndices = NULL; 4402 CmatT->mat = CcsrT; 4403 CcsrT->num_rows = n; 4404 CcsrT->num_cols = m; 4405 CcsrT->num_entries = c->nz; 4406 4407 CcsrT->row_offsets = new THRUSTINTARRAY32(n+1); 4408 CcsrT->column_indices = new THRUSTINTARRAY32(c->nz); 4409 CcsrT->values = new THRUSTARRAY(c->nz); 4410 4411 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4412 auto rT = CcsrT->row_offsets->begin(); 4413 if (AT) { 4414 rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT); 4415 thrust::advance(rT,-1); 4416 } 4417 if (BT) { 4418 auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz)); 4419 auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz)); 4420 thrust::copy(titb,tite,rT); 4421 } 4422 auto cT = CcsrT->column_indices->begin(); 4423 if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT); 4424 if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT); 4425 auto vT = CcsrT->values->begin(); 4426 if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4427 if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 4428 ierr = 
PetscLogGpuTimeEnd();CHKERRQ(ierr); 4429 4430 stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat); 4431 stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4432 stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 4433 cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 4434 cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 4435 cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 4436 cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4437 cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4438 cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4439 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4440 stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, 4441 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), 4442 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 4443 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 4444 #endif 4445 Ccusp->matTranspose = CmatT; 4446 } 4447 } 4448 4449 c->singlemalloc = PETSC_FALSE; 4450 c->free_a = PETSC_TRUE; 4451 c->free_ij = PETSC_TRUE; 4452 ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 4453 ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 4454 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 4455 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 4456 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 4457 ii = *Ccsr->row_offsets; 4458 jj = *Ccsr->column_indices; 4459 cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4460 
cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4461 } else { 4462 cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4463 cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4464 } 4465 ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 4466 ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 4467 ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 4468 c->maxnz = c->nz; 4469 c->nonzerorowcnt = 0; 4470 c->rmax = 0; 4471 for (i = 0; i < m; i++) { 4472 const PetscInt nn = c->i[i+1] - c->i[i]; 4473 c->ilen[i] = c->imax[i] = nn; 4474 c->nonzerorowcnt += (PetscInt)!!nn; 4475 c->rmax = PetscMax(c->rmax,nn); 4476 } 4477 ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr); 4478 ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 4479 (*C)->nonzerostate++; 4480 ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr); 4481 ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr); 4482 Ccusp->nonzerostate = (*C)->nonzerostate; 4483 (*C)->preallocated = PETSC_TRUE; 4484 } else { 4485 if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n); 4486 c = (Mat_SeqAIJ*)(*C)->data; 4487 if (c->nz) { 4488 Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 4489 if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm"); 4490 if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4491 if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate"); 4492 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4493 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 4494 if (!Acusp->mat) 
SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4495 if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4496 Acsr = (CsrMatrix*)Acusp->mat->mat; 4497 Bcsr = (CsrMatrix*)Bcusp->mat->mat; 4498 Ccsr = (CsrMatrix*)Ccusp->mat->mat; 4499 if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size()); 4500 if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size()); 4501 if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size()); 4502 if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries); 4503 if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size()); 4504 auto pmid = Ccusp->cooPerm->begin(); 4505 thrust::advance(pmid,Acsr->num_entries); 4506 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4507 auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), 4508 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin()))); 4509 auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), 4510 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4511 thrust::for_each(zibait,zieait,VecCUDAEquals()); 4512 auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), 4513 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4514 auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), 4515 
thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end()))); 4516 thrust::for_each(zibbit,ziebit,VecCUDAEquals()); 4517 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr); 4518 if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) { 4519 if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct"); 4520 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4521 CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4522 CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4523 CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat; 4524 auto vT = CcsrT->values->begin(); 4525 if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4526 if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 4527 (*C)->transupdated = PETSC_TRUE; 4528 } 4529 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4530 } 4531 } 4532 ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr); 4533 (*C)->assembled = PETSC_TRUE; 4534 (*C)->was_assembled = PETSC_FALSE; 4535 (*C)->offloadmask = PETSC_OFFLOAD_GPU; 4536 PetscFunctionReturn(0); 4537 } 4538 4539 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[]) 4540 { 4541 PetscErrorCode ierr; 4542 bool dmem; 4543 const PetscScalar *av; 4544 cudaError_t cerr; 4545 4546 PetscFunctionBegin; 4547 dmem = isCudaMem(v); 4548 ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr); 4549 if (n && idx) { 4550 THRUSTINTARRAY widx(n); 4551 widx.assign(idx,idx+n); 4552 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 4553 4554 THRUSTARRAY *w = NULL; 4555 thrust::device_ptr<PetscScalar> dv; 4556 if (dmem) { 4557 dv = thrust::device_pointer_cast(v); 4558 } else { 4559 w = new THRUSTARRAY(n); 4560 dv = w->data(); 4561 } 4562 
thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av); 4563 4564 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv)); 4565 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n)); 4566 thrust::for_each(zibit,zieit,VecCUDAEquals()); 4567 if (w) { 4568 cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4569 } 4570 delete w; 4571 } else { 4572 cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4573 } 4574 if (!dmem) { ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); } 4575 ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr); 4576 PetscFunctionReturn(0); 4577 } 4578