/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <thrust/async/for_each.h>

/* Human-readable names for MatCUSPARSEStorageFormat, in enum-value order, for PetscOptionsEnum() */
const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)        = 1,
      CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)        = 2,
      CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)        = 3,
      CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)        = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
  } cusparseCsr2CscAlg_t;
*/
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif

/* Forward declarations of the implementation routines registered on the Mat operation table */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

/* Destructors for the GPU-side helper structures (C++ overloads on the pointer type) */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);

PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);

/* Install a user-provided CUDA stream on the matrix's cuSPARSE handle.
   The stream is stored in the Mat_SeqAIJCUSPARSE struct and passed to cusparseSetStream(). */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

/* Install a user-provided cuSPARSE handle on the matrix.
   Any previously held, different handle is destroyed first (the struct owns its handle);
   the (new) handle is then switched to device pointer mode. */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      /* release the old handle before adopting the caller's */
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

/* Drop (without destroying) the cuSPARSE handle stored on A.
   NOTE(review): the handle is only zeroed here, not cusparseDestroy()'d — presumably
   ownership has been transferred elsewhere when this is called; confirm against callers. */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          flg;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg || !cusparsestruct) PetscFunctionReturn(0);
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

/* Query callback composed on factor matrices: reports MATSOLVERCUSPARSE as the solver package */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/* Factory routine registered for (LU/ILU/ILUDT/Cholesky/ICC) factorizations of seqaijcusparse
   matrices: creates the factor matrix B, installs the symbolic-factorization callbacks for the
   requested factor type, and records the preferred orderings. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    /* ND for full LU, natural ordering for the incomplete variants */
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr);
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr);
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  (*B)->canuseordering = PETSC_TRUE;
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type-specific implementation behind MatCUSPARSESetFormat(): records the storage format
   in the GPU struct. MULT and ALL currently set the same single format field. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.
   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Output Parameter:

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation (no-op if the type did not compose it) */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* MatSetOption override: intercepts MAT_FORM_EXPLICIT_TRANSPOSE (so a stale GPU transpose can be
   invalidated when the option is turned off) and forwards everything else to the SeqAIJ base. */
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_FORM_EXPLICIT_TRANSPOSE:
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);}
    A->form_explicit_transpose = flg;
    break;
  default:
    ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
    break;
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);

/* Numeric LU factorization: done on the CPU by the SeqAIJ kernel, then the triangular
   factors are analyzed and copied to the GPU for the (GPU) MatSolve phase. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* make sure the host copy of A is current before the CPU factorization */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    /* natural ordering: no permutation vectors needed in the solve */
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Options processing for seqaijcusparse: storage format for SpMV/TriSolve and, with CUDA >= 11,
   the cuSPARSE SpMV/SpMM/csr2csc algorithm choices. Factored matrices take no options here. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Symbolic ILU: resets any previous GPU factor data, runs the CPU symbolic phase,
   and installs the CUSPARSE numeric-factorization callback. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic LU: same pattern as ILU above, delegating to the SeqAIJ symbolic LU. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic ICC: resets GPU factor data, runs the SeqAIJ symbolic ICC, installs the
   CUSPARSE Cholesky numeric callback. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic Cholesky: same pattern as ICC above. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Build (or refresh the values of) the unit-diagonal lower triangular factor L on the GPU
   from the combined CPU ILU factor stored in A (a factored SeqAIJ matrix).
   First call: assembles CSR arrays in pinned host memory, uploads them into thrust device
   arrays, and runs the cuSPARSE triangular-solve analysis.
   Subsequent calls: only re-pack and re-upload the numerical values. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;

        /* pinned host buffers for fast async-capable transfers */
        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix: row i of A's strictly-lower part plus a unit diagonal entry */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 loTriFactor->solveInfo,
                                 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                                 loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer; AALo is kept (as AA_h) for later value-only updates */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h = AALo;
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v      = aa;
        vi     = aj;
        offset = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Build (or refresh the values of) the non-unit-diagonal upper triangular factor U on the GPU
   from the combined CPU ILU factor stored in A; mirrors the lower-triangular routine above. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  PetscInt   n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        /* pinned host buffer for the values */
        cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix, walking rows backwards (SeqAIJ stores U rows
           via adiag[] in reverse); the stored diagonal is inverted (1/d) here */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;
          vi = aj + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1./v[nz];
          AiUp[i]      = AiUp[i+1] - (nz+1);

          ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = n;
        upTriFactor->csrMat->num_cols    = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 upTriFactor->solveInfo,
                                 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                                 upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer; AAUp is kept (as AA_h) for later value-only updates */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
        upTriFactor->AA_h = AAUp;
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      } else {
        /* structure already on the GPU: refresh numerical values only */
        if (!upTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v = aa + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1./v[nz];
          ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
        ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Build both ILU triangular factors on the GPU, allocate the solve work vector, and
   upload the row/column permutation indices when the orderings are not the identity. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cusparseTriFactors->nnz=a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
    ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* upper triangular indices */
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
    ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Build both triangular factors of an ICC factorization on the GPU from the CPU factor.
   The CPU ICC factor is stored with SBAIJ layout (upper triangle only, see the b pointer
   below); this routine definition continues beyond the end of this chunk. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix; AALo simultaneously receives scaled values
           (NOTE(review): AALo appears to hold the values of the transposed/scaled factor —
           confirm against the lower-factor setup past this chunk) */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = A->rmap->n;
        upTriFactor->csrMat->num_cols    = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 797 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 798 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 799 &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 800 cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 801 #endif 802 803 /* perform the solve analysis */ 804 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 805 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 806 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 807 upTriFactor->csrMat->column_indices->data().get(), 808 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 809 upTriFactor->solveInfo, 810 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 811 #else 812 upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 813 #endif 814 cerr = WaitForCUDA();CHKERRCUDA(cerr); 815 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 816 817 /* assign the pointer */ 818 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 819 820 /* allocate space for the triangular factor information */ 821 ierr = PetscNew(&loTriFactor);CHKERRQ(ierr); 822 loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 823 824 /* Create the matrix description */ 825 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 826 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 827 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 828 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 829 #else 830 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 831 #endif 832 stat = cusparseSetMatFillMode(loTriFactor->descr, 
CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 833 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 834 835 /* set the operation */ 836 loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 837 838 /* set the matrix */ 839 loTriFactor->csrMat = new CsrMatrix; 840 loTriFactor->csrMat->num_rows = A->rmap->n; 841 loTriFactor->csrMat->num_cols = A->cmap->n; 842 loTriFactor->csrMat->num_entries = a->nz; 843 844 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 845 loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 846 847 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 848 loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 849 850 loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 851 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 852 853 /* Create the solve analysis information */ 854 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 855 stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 856 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 857 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 858 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 859 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 860 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 861 &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 862 cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 863 #endif 864 865 /* perform the solve analysis */ 866 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 867 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 868 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 869 loTriFactor->csrMat->column_indices->data().get(), 
870 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 871 loTriFactor->solveInfo, 872 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 873 #else 874 loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 875 #endif 876 cerr = WaitForCUDA();CHKERRCUDA(cerr); 877 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 878 879 /* assign the pointer */ 880 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 881 882 ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr); 883 cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 884 cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 885 } else { 886 /* Fill the upper triangular matrix */ 887 offset = 0; 888 for (i=0; i<n; i++) { 889 /* set the pointers */ 890 v = aa + ai[i]; 891 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 892 893 /* first, set the diagonal elements */ 894 AAUp[offset] = 1.0/v[nz]; 895 AALo[offset] = 1.0/v[nz]; 896 897 offset+=1; 898 if (nz>0) { 899 ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 900 for (j=offset; j<offset+nz; j++) { 901 AAUp[j] = -AAUp[j]; 902 AALo[j] = AAUp[j]/v[nz]; 903 } 904 offset+=nz; 905 } 906 } 907 if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 908 if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 909 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 910 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 911 ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 912 } 913 cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr); 914 cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr); 915 } catch(char *ex) { 916 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 917 } 918 } 919 PetscFunctionReturn(0); 920 } 921 922 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 923 { 924 PetscErrorCode ierr; 925 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 926 Mat_SeqAIJCUSPARSETriFactors 
*cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 927 IS ip = a->row; 928 PetscBool perm_identity; 929 PetscInt n = A->rmap->n; 930 931 PetscFunctionBegin; 932 if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 933 ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr); 934 if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 935 cusparseTriFactors->nnz=(a->nz-n)*2 + n; 936 937 A->offloadmask = PETSC_OFFLOAD_BOTH; 938 939 /* lower triangular indices */ 940 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 941 if (!perm_identity) { 942 IS iip; 943 const PetscInt *irip,*rip; 944 945 ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr); 946 ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr); 947 ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); 948 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 949 cusparseTriFactors->rpermIndices->assign(rip, rip+n); 950 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 951 cusparseTriFactors->cpermIndices->assign(irip, irip+n); 952 ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr); 953 ierr = ISDestroy(&iip);CHKERRQ(ierr); 954 ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 955 ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 956 } 957 PetscFunctionReturn(0); 958 } 959 960 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 961 { 962 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 963 IS ip = b->row; 964 PetscBool perm_identity; 965 PetscErrorCode ierr; 966 967 PetscFunctionBegin; 968 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 969 ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 970 B->offloadmask = PETSC_OFFLOAD_CPU; 971 /* determine which version of MatSolve needs to be used. 
*/ 972 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 973 if (perm_identity) { 974 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 975 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 976 B->ops->matsolve = NULL; 977 B->ops->matsolvetranspose = NULL; 978 } else { 979 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 980 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 981 B->ops->matsolve = NULL; 982 B->ops->matsolvetranspose = NULL; 983 } 984 985 /* get the triangular factors */ 986 ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 987 PetscFunctionReturn(0); 988 } 989 990 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 991 { 992 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 993 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 994 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 995 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; 996 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; 997 cusparseStatus_t stat; 998 cusparseIndexBase_t indexBase; 999 cusparseMatrixType_t matrixType; 1000 cusparseFillMode_t fillMode; 1001 cusparseDiagType_t diagType; 1002 cudaError_t cerr; 1003 PetscErrorCode ierr; 1004 1005 PetscFunctionBegin; 1006 /* allocate space for the transpose of the lower triangular factor */ 1007 ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr); 1008 loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1009 1010 /* set the matrix descriptors of the lower triangular factor */ 1011 matrixType = cusparseGetMatType(loTriFactor->descr); 1012 indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 1013 fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 
1014 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1015 diagType = cusparseGetMatDiagType(loTriFactor->descr); 1016 1017 /* Create the matrix description */ 1018 stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat); 1019 stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 1020 stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 1021 stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 1022 stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1023 1024 /* set the operation */ 1025 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1026 1027 /* allocate GPU space for the CSC of the lower triangular factor*/ 1028 loTriFactorT->csrMat = new CsrMatrix; 1029 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 1030 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 1031 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 1032 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1); 1033 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 1034 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 1035 1036 /* compute the transpose of the lower triangular factor, i.e. 
the CSC */ 1037 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1038 stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1039 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1040 loTriFactor->csrMat->values->data().get(), 1041 loTriFactor->csrMat->row_offsets->data().get(), 1042 loTriFactor->csrMat->column_indices->data().get(), 1043 loTriFactorT->csrMat->values->data().get(), 1044 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1045 CUSPARSE_ACTION_NUMERIC,indexBase, 1046 CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1047 cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1048 #endif 1049 1050 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1051 stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1052 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1053 loTriFactor->csrMat->values->data().get(), 1054 loTriFactor->csrMat->row_offsets->data().get(), 1055 loTriFactor->csrMat->column_indices->data().get(), 1056 loTriFactorT->csrMat->values->data().get(), 1057 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1058 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1059 CUSPARSE_ACTION_NUMERIC, indexBase, 1060 CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat); 1061 #else 1062 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1063 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1064 #endif 1065 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1066 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1067 1068 /* Create the solve analysis information */ 1069 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1070 
stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1071 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1072 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, 1073 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1074 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1075 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, 1076 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1077 cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1078 #endif 1079 1080 /* perform the solve analysis */ 1081 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 1082 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1083 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1084 loTriFactorT->csrMat->column_indices->data().get(), 1085 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1086 loTriFactorT->solveInfo, 1087 loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1088 #else 1089 loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1090 #endif 1091 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1092 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1093 1094 /* assign the pointer */ 1095 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 1096 1097 /*********************************************/ 1098 /* Now the Transpose of the Upper Tri Factor */ 1099 /*********************************************/ 1100 1101 /* allocate space for the transpose of the upper triangular factor */ 1102 ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr); 1103 upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1104 1105 /* set the matrix descriptors of the upper triangular factor */ 1106 matrixType = cusparseGetMatType(upTriFactor->descr); 
1107 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 1108 fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1109 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1110 diagType = cusparseGetMatDiagType(upTriFactor->descr); 1111 1112 /* Create the matrix description */ 1113 stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat); 1114 stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 1115 stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 1116 stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 1117 stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1118 1119 /* set the operation */ 1120 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1121 1122 /* allocate GPU space for the CSC of the upper triangular factor*/ 1123 upTriFactorT->csrMat = new CsrMatrix; 1124 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1125 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1126 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1127 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1); 1128 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1129 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1130 1131 /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 1132 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1133 stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows, 1134 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1135 upTriFactor->csrMat->values->data().get(), 1136 upTriFactor->csrMat->row_offsets->data().get(), 1137 upTriFactor->csrMat->column_indices->data().get(), 1138 upTriFactorT->csrMat->values->data().get(), 1139 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1140 CUSPARSE_ACTION_NUMERIC,indexBase, 1141 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1142 cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1143 #endif 1144 1145 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1146 stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 1147 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1148 upTriFactor->csrMat->values->data().get(), 1149 upTriFactor->csrMat->row_offsets->data().get(), 1150 upTriFactor->csrMat->column_indices->data().get(), 1151 upTriFactorT->csrMat->values->data().get(), 1152 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1153 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1154 CUSPARSE_ACTION_NUMERIC, indexBase, 1155 CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat); 1156 #else 1157 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1158 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1159 #endif 1160 1161 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1162 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1163 1164 /* Create the solve analysis information */ 1165 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 
1166 stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1167 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1168 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, 1169 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1170 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1171 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, 1172 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1173 cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1174 #endif 1175 1176 /* perform the solve analysis */ 1177 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 1178 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1179 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1180 upTriFactorT->csrMat->column_indices->data().get(), 1181 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1182 upTriFactorT->solveInfo, 1183 upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1184 #else 1185 upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1186 #endif 1187 1188 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1189 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1190 1191 /* assign the pointer */ 1192 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 1193 PetscFunctionReturn(0); 1194 } 1195 1196 struct PetscScalarToPetscInt 1197 { 1198 __host__ __device__ 1199 PetscInt operator()(PetscScalar s) 1200 { 1201 return (PetscInt)PetscRealPart(s); 1202 } 1203 }; 1204 1205 static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTransposeForMult(Mat A) 1206 { 1207 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1208 Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT; 1209 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1210 
cusparseStatus_t stat; 1211 cusparseIndexBase_t indexBase; 1212 cudaError_t err; 1213 PetscErrorCode ierr; 1214 1215 PetscFunctionBegin; 1216 if (!A->form_explicit_transpose || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0); 1217 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1218 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1219 if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct"); 1220 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1221 if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct"); 1222 if (A->transupdated) PetscFunctionReturn(0); 1223 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1224 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1225 if (cusparsestruct->format != MAT_CUSPARSE_CSR) { 1226 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 1227 } 1228 if (!cusparsestruct->matTranspose) { /* create cusparse matrix */ 1229 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 1230 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat); 1231 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1232 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat); 1233 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1234 1235 /* set alpha and beta */ 1236 err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 1237 err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1238 err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1239 err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1240 err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1241 err = 
cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1242 1243 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1244 CsrMatrix *matrixT = new CsrMatrix; 1245 matstructT->mat = matrixT; 1246 matrixT->num_rows = A->cmap->n; 1247 matrixT->num_cols = A->rmap->n; 1248 matrixT->num_entries = a->nz; 1249 matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 1250 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1251 matrixT->values = new THRUSTARRAY(a->nz); 1252 1253 if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); } 1254 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 1255 1256 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1257 stat = cusparseCreateCsr(&matstructT->matDescr, 1258 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1259 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1260 matrixT->values->data().get(), 1261 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1262 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 1263 #endif 1264 } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 1265 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1266 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1267 #else 1268 CsrMatrix *temp = new CsrMatrix; 1269 CsrMatrix *tempT = new CsrMatrix; 1270 /* First convert HYB to CSR */ 1271 temp->num_rows = A->rmap->n; 1272 temp->num_cols = A->cmap->n; 1273 temp->num_entries = a->nz; 1274 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1275 temp->column_indices = new THRUSTINTARRAY32(a->nz); 1276 temp->values = new THRUSTARRAY(a->nz); 1277 1278 stat = cusparse_hyb2csr(cusparsestruct->handle, 1279 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 1280 temp->values->data().get(), 1281 
temp->row_offsets->data().get(), 1282 temp->column_indices->data().get());CHKERRCUSPARSE(stat); 1283 1284 /* Next, convert CSR to CSC (i.e. the matrix transpose) */ 1285 tempT->num_rows = A->rmap->n; 1286 tempT->num_cols = A->cmap->n; 1287 tempT->num_entries = a->nz; 1288 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1289 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1290 tempT->values = new THRUSTARRAY(a->nz); 1291 1292 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1293 temp->num_cols, temp->num_entries, 1294 temp->values->data().get(), 1295 temp->row_offsets->data().get(), 1296 temp->column_indices->data().get(), 1297 tempT->values->data().get(), 1298 tempT->column_indices->data().get(), 1299 tempT->row_offsets->data().get(), 1300 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1301 1302 /* Last, convert CSC to HYB */ 1303 cusparseHybMat_t hybMat; 1304 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1305 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 
1306 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1307 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1308 matstructT->descr, tempT->values->data().get(), 1309 tempT->row_offsets->data().get(), 1310 tempT->column_indices->data().get(), 1311 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1312 1313 /* assign the pointer */ 1314 matstructT->mat = hybMat; 1315 A->transupdated = PETSC_TRUE; 1316 /* delete temporaries */ 1317 if (tempT) { 1318 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1319 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1320 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1321 delete (CsrMatrix*) tempT; 1322 } 1323 if (temp) { 1324 if (temp->values) delete (THRUSTARRAY*) temp->values; 1325 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1326 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1327 delete (CsrMatrix*) temp; 1328 } 1329 #endif 1330 } 1331 } 1332 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */ 1333 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1334 CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat; 1335 if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix"); 1336 if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows"); 1337 if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols"); 1338 if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values"); 1339 if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT"); 1340 if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows"); 1341 if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols"); 1342 if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values"); 1343 if 
(!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */ 1344 cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1345 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 1346 ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 1347 } 1348 if (!cusparsestruct->csr2csc_i) { 1349 THRUSTARRAY csr2csc_a(matrix->num_entries); 1350 PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0)); 1351 1352 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1353 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1354 void *csr2cscBuffer; 1355 size_t csr2cscBufferSize; 1356 stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, 1357 A->cmap->n, matrix->num_entries, 1358 matrix->values->data().get(), 1359 cusparsestruct->rowoffsets_gpu->data().get(), 1360 matrix->column_indices->data().get(), 1361 matrixT->values->data().get(), 1362 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1363 CUSPARSE_ACTION_NUMERIC,indexBase, 1364 cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat); 1365 err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err); 1366 #endif 1367 1368 if (matrix->num_entries) { 1369 /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in 1370 mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK. 1371 I checked every parameters and they were just fine. I have no clue why cusparse complains. 1372 1373 Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[] 1374 should be filled with indexBase. So I just take a shortcut here. 
1375 */ 1376 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1377 A->cmap->n,matrix->num_entries, 1378 csr2csc_a.data().get(), 1379 cusparsestruct->rowoffsets_gpu->data().get(), 1380 matrix->column_indices->data().get(), 1381 matrixT->values->data().get(), 1382 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1383 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1384 CUSPARSE_ACTION_NUMERIC,indexBase, 1385 cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat); 1386 #else 1387 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1388 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1389 #endif 1390 } else { 1391 matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 1392 } 1393 1394 cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); 1395 PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt())); 1396 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1397 err = cudaFree(csr2cscBuffer);CHKERRCUDA(err); 1398 #endif 1399 } 1400 PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), 1401 thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), 1402 matrixT->values->begin())); 1403 } 1404 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1405 ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1406 /* the compressed row indices is not used for matTranspose */ 1407 matstructT->cprowIndices = NULL; 1408 /* assign the pointer */ 1409 ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 1410 A->transupdated = PETSC_TRUE; 1411 PetscFunctionReturn(0); 1412 } 1413 1414 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? 
*/ 1415 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1416 { 1417 PetscInt n = xx->map->n; 1418 const PetscScalar *barray; 1419 PetscScalar *xarray; 1420 thrust::device_ptr<const PetscScalar> bGPU; 1421 thrust::device_ptr<PetscScalar> xGPU; 1422 cusparseStatus_t stat; 1423 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1424 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1425 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1426 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1427 PetscErrorCode ierr; 1428 1429 PetscFunctionBegin; 1430 /* Analyze the matrix and create the transpose ... on the fly */ 1431 if (!loTriFactorT && !upTriFactorT) { 1432 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1433 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1434 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1435 } 1436 1437 /* Get the GPU pointers */ 1438 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1439 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1440 xGPU = thrust::device_pointer_cast(xarray); 1441 bGPU = thrust::device_pointer_cast(barray); 1442 1443 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1444 /* First, reorder with the row permutation */ 1445 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1446 thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 1447 xGPU); 1448 1449 /* First, solve U */ 1450 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1451 upTriFactorT->csrMat->num_rows, 1452 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1453 
upTriFactorT->csrMat->num_entries, 1454 #endif 1455 &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1456 upTriFactorT->csrMat->values->data().get(), 1457 upTriFactorT->csrMat->row_offsets->data().get(), 1458 upTriFactorT->csrMat->column_indices->data().get(), 1459 upTriFactorT->solveInfo, 1460 xarray, 1461 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1462 tempGPU->data().get(), 1463 upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1464 #else 1465 tempGPU->data().get());CHKERRCUSPARSE(stat); 1466 #endif 1467 1468 /* Then, solve L */ 1469 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1470 loTriFactorT->csrMat->num_rows, 1471 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1472 loTriFactorT->csrMat->num_entries, 1473 #endif 1474 &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1475 loTriFactorT->csrMat->values->data().get(), 1476 loTriFactorT->csrMat->row_offsets->data().get(), 1477 loTriFactorT->csrMat->column_indices->data().get(), 1478 loTriFactorT->solveInfo, 1479 tempGPU->data().get(), 1480 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1481 xarray, 1482 loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1483 #else 1484 xarray);CHKERRCUSPARSE(stat); 1485 #endif 1486 1487 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1488 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1489 thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1490 tempGPU->begin()); 1491 1492 /* Copy the temporary to the full solution. 
*/ 1493 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU); 1494 1495 /* restore */ 1496 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1497 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1498 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1499 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1500 PetscFunctionReturn(0); 1501 } 1502 1503 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1504 { 1505 const PetscScalar *barray; 1506 PetscScalar *xarray; 1507 cusparseStatus_t stat; 1508 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1509 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1510 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1511 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1512 PetscErrorCode ierr; 1513 1514 PetscFunctionBegin; 1515 /* Analyze the matrix and create the transpose ... 
on the fly */ 1516 if (!loTriFactorT && !upTriFactorT) { 1517 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1518 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1519 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1520 } 1521 1522 /* Get the GPU pointers */ 1523 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1524 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1525 1526 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1527 /* First, solve U */ 1528 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1529 upTriFactorT->csrMat->num_rows, 1530 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1531 upTriFactorT->csrMat->num_entries, 1532 #endif 1533 &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1534 upTriFactorT->csrMat->values->data().get(), 1535 upTriFactorT->csrMat->row_offsets->data().get(), 1536 upTriFactorT->csrMat->column_indices->data().get(), 1537 upTriFactorT->solveInfo, 1538 barray, 1539 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1540 tempGPU->data().get(), 1541 upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1542 #else 1543 tempGPU->data().get());CHKERRCUSPARSE(stat); 1544 #endif 1545 1546 /* Then, solve L */ 1547 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1548 loTriFactorT->csrMat->num_rows, 1549 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1550 loTriFactorT->csrMat->num_entries, 1551 #endif 1552 &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1553 loTriFactorT->csrMat->values->data().get(), 1554 loTriFactorT->csrMat->row_offsets->data().get(), 1555 loTriFactorT->csrMat->column_indices->data().get(), 1556 loTriFactorT->solveInfo, 1557 tempGPU->data().get(), 1558 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1559 xarray, 1560 loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1561 #else 1562 xarray);CHKERRCUSPARSE(stat); 1563 #endif 1564 1565 /* restore */ 1566 ierr = 
VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1567 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1568 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1569 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1570 PetscFunctionReturn(0); 1571 } 1572 1573 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1574 { 1575 const PetscScalar *barray; 1576 PetscScalar *xarray; 1577 thrust::device_ptr<const PetscScalar> bGPU; 1578 thrust::device_ptr<PetscScalar> xGPU; 1579 cusparseStatus_t stat; 1580 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1581 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1582 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1583 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1584 PetscErrorCode ierr; 1585 1586 PetscFunctionBegin; 1587 1588 /* Get the GPU pointers */ 1589 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1590 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1591 xGPU = thrust::device_pointer_cast(xarray); 1592 bGPU = thrust::device_pointer_cast(barray); 1593 1594 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1595 /* First, reorder with the row permutation */ 1596 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1597 thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 1598 tempGPU->begin()); 1599 1600 /* Next, solve L */ 1601 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1602 loTriFactor->csrMat->num_rows, 1603 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1604 loTriFactor->csrMat->num_entries, 1605 #endif 1606 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1607 loTriFactor->csrMat->values->data().get(), 1608 
loTriFactor->csrMat->row_offsets->data().get(), 1609 loTriFactor->csrMat->column_indices->data().get(), 1610 loTriFactor->solveInfo, 1611 tempGPU->data().get(), 1612 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1613 xarray, 1614 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1615 #else 1616 xarray);CHKERRCUSPARSE(stat); 1617 #endif 1618 1619 /* Then, solve U */ 1620 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1621 upTriFactor->csrMat->num_rows, 1622 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1623 upTriFactor->csrMat->num_entries, 1624 #endif 1625 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1626 upTriFactor->csrMat->values->data().get(), 1627 upTriFactor->csrMat->row_offsets->data().get(), 1628 upTriFactor->csrMat->column_indices->data().get(), 1629 upTriFactor->solveInfo,xarray, 1630 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1631 tempGPU->data().get(), 1632 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1633 #else 1634 tempGPU->data().get());CHKERRCUSPARSE(stat); 1635 #endif 1636 1637 /* Last, reorder with the column permutation */ 1638 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 1639 thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), 1640 xGPU); 1641 1642 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1643 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1644 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1645 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1646 PetscFunctionReturn(0); 1647 } 1648 1649 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1650 { 1651 const PetscScalar *barray; 1652 PetscScalar *xarray; 1653 cusparseStatus_t stat; 1654 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1655 
Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1656 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1657 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1658 PetscErrorCode ierr; 1659 1660 PetscFunctionBegin; 1661 /* Get the GPU pointers */ 1662 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1663 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1664 1665 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1666 /* First, solve L */ 1667 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1668 loTriFactor->csrMat->num_rows, 1669 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1670 loTriFactor->csrMat->num_entries, 1671 #endif 1672 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1673 loTriFactor->csrMat->values->data().get(), 1674 loTriFactor->csrMat->row_offsets->data().get(), 1675 loTriFactor->csrMat->column_indices->data().get(), 1676 loTriFactor->solveInfo, 1677 barray, 1678 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1679 tempGPU->data().get(), 1680 loTriFactor->solvePolicy,loTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1681 #else 1682 tempGPU->data().get());CHKERRCUSPARSE(stat); 1683 #endif 1684 1685 /* Next, solve U */ 1686 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1687 upTriFactor->csrMat->num_rows, 1688 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1689 upTriFactor->csrMat->num_entries, 1690 #endif 1691 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1692 upTriFactor->csrMat->values->data().get(), 1693 upTriFactor->csrMat->row_offsets->data().get(), 1694 upTriFactor->csrMat->column_indices->data().get(), 1695 upTriFactor->solveInfo, 1696 tempGPU->data().get(), 1697 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1698 xarray, 1699 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1700 #else 1701 xarray);CHKERRCUSPARSE(stat); 1702 #endif 1703 1704 ierr = 
VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1705 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1706 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1707 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1708 PetscFunctionReturn(0); 1709 } 1710 1711 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) 1712 { 1713 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1714 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1715 cudaError_t cerr; 1716 PetscErrorCode ierr; 1717 1718 PetscFunctionBegin; 1719 if (A->offloadmask == PETSC_OFFLOAD_GPU) { 1720 CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat; 1721 1722 ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 1723 cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 1724 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1725 ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr); 1726 ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 1727 A->offloadmask = PETSC_OFFLOAD_BOTH; 1728 } 1729 PetscFunctionReturn(0); 1730 } 1731 1732 static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 1733 { 1734 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1735 PetscErrorCode ierr; 1736 1737 PetscFunctionBegin; 1738 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 1739 *array = a->a; 1740 A->offloadmask = PETSC_OFFLOAD_CPU; 1741 PetscFunctionReturn(0); 1742 } 1743 1744 PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 1745 { 1746 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1747 Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 1748 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1749 PetscInt m = A->rmap->n,*ii,*ridx,tmp; 1750 PetscErrorCode ierr; 1751 cusparseStatus_t stat; 1752 PetscBool both = PETSC_TRUE; 1753 cudaError_t err; 1754 1755 PetscFunctionBegin; 1756 if (A->boundtocpu) 
SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU"); 1757 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 1758 if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */ 1759 CsrMatrix *matrix; 1760 matrix = (CsrMatrix*)cusparsestruct->mat->mat; 1761 1762 if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values"); 1763 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1764 matrix->values->assign(a->a, a->a+a->nz); 1765 err = WaitForCUDA();CHKERRCUDA(err); 1766 ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 1767 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1768 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); 1769 } else { 1770 PetscInt nnz; 1771 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1772 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr); 1773 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 1774 delete cusparsestruct->workVector; 1775 delete cusparsestruct->rowoffsets_gpu; 1776 cusparsestruct->workVector = NULL; 1777 cusparsestruct->rowoffsets_gpu = NULL; 1778 try { 1779 if (a->compressedrow.use) { 1780 m = a->compressedrow.nrows; 1781 ii = a->compressedrow.i; 1782 ridx = a->compressedrow.rindex; 1783 } else { 1784 m = A->rmap->n; 1785 ii = a->i; 1786 ridx = NULL; 1787 } 1788 if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data"); 1789 if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data"); 1790 if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; } 1791 else nnz = a->nz; 1792 1793 /* create cusparse matrix */ 1794 cusparsestruct->nrows = m; 1795 matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 1796 stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat); 1797 stat = 
cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 1798 stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1799 1800 err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 1801 err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1802 err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1803 err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1804 err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1805 err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1806 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 1807 1808 /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1809 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1810 /* set the matrix */ 1811 CsrMatrix *mat= new CsrMatrix; 1812 mat->num_rows = m; 1813 mat->num_cols = A->cmap->n; 1814 mat->num_entries = nnz; 1815 mat->row_offsets = new THRUSTINTARRAY32(m+1); 1816 mat->row_offsets->assign(ii, ii + m+1); 1817 1818 mat->column_indices = new THRUSTINTARRAY32(nnz); 1819 mat->column_indices->assign(a->j, a->j+nnz); 1820 1821 mat->values = new THRUSTARRAY(nnz); 1822 if (a->a) mat->values->assign(a->a, a->a+nnz); 1823 1824 /* assign the pointer */ 1825 matstruct->mat = mat; 1826 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1827 if (mat->num_rows) { /* cusparse errors on empty matrices! 
*/ 1828 stat = cusparseCreateCsr(&matstruct->matDescr, 1829 mat->num_rows, mat->num_cols, mat->num_entries, 1830 mat->row_offsets->data().get(), mat->column_indices->data().get(), 1831 mat->values->data().get(), 1832 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 1833 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 1834 } 1835 #endif 1836 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1837 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1838 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1839 #else 1840 CsrMatrix *mat= new CsrMatrix; 1841 mat->num_rows = m; 1842 mat->num_cols = A->cmap->n; 1843 mat->num_entries = nnz; 1844 mat->row_offsets = new THRUSTINTARRAY32(m+1); 1845 mat->row_offsets->assign(ii, ii + m+1); 1846 1847 mat->column_indices = new THRUSTINTARRAY32(nnz); 1848 mat->column_indices->assign(a->j, a->j+nnz); 1849 1850 mat->values = new THRUSTARRAY(nnz); 1851 if (a->a) mat->values->assign(a->a, a->a+nnz); 1852 1853 cusparseHybMat_t hybMat; 1854 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1855 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 
1856 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1857 stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, 1858 matstruct->descr, mat->values->data().get(), 1859 mat->row_offsets->data().get(), 1860 mat->column_indices->data().get(), 1861 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1862 /* assign the pointer */ 1863 matstruct->mat = hybMat; 1864 1865 if (mat) { 1866 if (mat->values) delete (THRUSTARRAY*)mat->values; 1867 if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices; 1868 if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets; 1869 delete (CsrMatrix*)mat; 1870 } 1871 #endif 1872 } 1873 1874 /* assign the compressed row indices */ 1875 if (a->compressedrow.use) { 1876 cusparsestruct->workVector = new THRUSTARRAY(m); 1877 matstruct->cprowIndices = new THRUSTINTARRAY(m); 1878 matstruct->cprowIndices->assign(ridx,ridx+m); 1879 tmp = m; 1880 } else { 1881 cusparsestruct->workVector = NULL; 1882 matstruct->cprowIndices = NULL; 1883 tmp = 0; 1884 } 1885 ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr); 1886 1887 /* assign the pointer */ 1888 cusparsestruct->mat = matstruct; 1889 } catch(char *ex) { 1890 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1891 } 1892 err = WaitForCUDA();CHKERRCUDA(err); 1893 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1894 cusparsestruct->nonzerostate = A->nonzerostate; 1895 } 1896 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 1897 } 1898 PetscFunctionReturn(0); 1899 } 1900 1901 struct VecCUDAPlusEquals 1902 { 1903 template <typename Tuple> 1904 __host__ __device__ 1905 void operator()(Tuple t) 1906 { 1907 thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 1908 } 1909 }; 1910 1911 struct VecCUDAEquals 1912 { 1913 template <typename Tuple> 1914 __host__ __device__ 1915 void operator()(Tuple t) 1916 { 1917 thrust::get<1>(t) = thrust::get<0>(t); 1918 } 
1919 }; 1920 1921 struct VecCUDAEqualsReverse 1922 { 1923 template <typename Tuple> 1924 __host__ __device__ 1925 void operator()(Tuple t) 1926 { 1927 thrust::get<0>(t) = thrust::get<1>(t); 1928 } 1929 }; 1930 1931 struct MatMatCusparse { 1932 PetscBool cisdense; 1933 PetscScalar *Bt; 1934 Mat X; 1935 PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */ 1936 PetscLogDouble flops; 1937 CsrMatrix *Bcsr; 1938 1939 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1940 cusparseSpMatDescr_t matSpBDescr; 1941 PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 1942 cusparseDnMatDescr_t matBDescr; 1943 cusparseDnMatDescr_t matCDescr; 1944 PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/ 1945 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 1946 void *dBuffer4; 1947 void *dBuffer5; 1948 #endif 1949 size_t mmBufferSize; 1950 void *mmBuffer; 1951 void *mmBuffer2; /* SpGEMM WorkEstimation buffer */ 1952 cusparseSpGEMMDescr_t spgemmDesc; 1953 #endif 1954 }; 1955 1956 static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 1957 { 1958 PetscErrorCode ierr; 1959 MatMatCusparse *mmdata = (MatMatCusparse *)data; 1960 cudaError_t cerr; 1961 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1962 cusparseStatus_t stat; 1963 #endif 1964 1965 PetscFunctionBegin; 1966 cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr); 1967 delete mmdata->Bcsr; 1968 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1969 if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); } 1970 if (mmdata->matBDescr) { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); } 1971 if (mmdata->matCDescr) { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); } 1972 if (mmdata->spgemmDesc) { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); } 1973 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 1974 if (mmdata->dBuffer4) { cerr = cudaFree(mmdata->dBuffer4);CHKERRCUDA(cerr); } 
1975 if (mmdata->dBuffer5) { cerr = cudaFree(mmdata->dBuffer5);CHKERRCUDA(cerr); } 1976 #endif 1977 if (mmdata->mmBuffer) { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); } 1978 if (mmdata->mmBuffer2) { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); } 1979 #endif 1980 ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr); 1981 ierr = PetscFree(data);CHKERRQ(ierr); 1982 PetscFunctionReturn(0); 1983 } 1984 1985 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool); 1986 1987 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 1988 { 1989 Mat_Product *product = C->product; 1990 Mat A,B; 1991 PetscInt m,n,blda,clda; 1992 PetscBool flg,biscuda; 1993 Mat_SeqAIJCUSPARSE *cusp; 1994 cusparseStatus_t stat; 1995 cusparseOperation_t opA; 1996 const PetscScalar *barray; 1997 PetscScalar *carray; 1998 PetscErrorCode ierr; 1999 MatMatCusparse *mmdata; 2000 Mat_SeqAIJCUSPARSEMultStruct *mat; 2001 CsrMatrix *csrmat; 2002 2003 PetscFunctionBegin; 2004 MatCheckProduct(C,1); 2005 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty"); 2006 mmdata = (MatMatCusparse*)product->data; 2007 A = product->A; 2008 B = product->B; 2009 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2010 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2011 /* currently CopyToGpu does not copy if the matrix is bound to CPU 2012 Instead of silently accepting the wrong answer, I prefer to raise the error */ 2013 if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2014 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2015 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2016 switch (product->type) { 2017 case MATPRODUCT_AB: 2018 case MATPRODUCT_PtAP: 2019 mat = cusp->mat; 2020 
opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2021 m = A->rmap->n; 2022 n = B->cmap->n; 2023 break; 2024 case MATPRODUCT_AtB: 2025 if (!A->form_explicit_transpose) { 2026 mat = cusp->mat; 2027 opA = CUSPARSE_OPERATION_TRANSPOSE; 2028 } else { 2029 ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr); 2030 mat = cusp->matTranspose; 2031 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2032 } 2033 m = A->cmap->n; 2034 n = B->cmap->n; 2035 break; 2036 case MATPRODUCT_ABt: 2037 case MATPRODUCT_RARt: 2038 mat = cusp->mat; 2039 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2040 m = A->rmap->n; 2041 n = B->rmap->n; 2042 break; 2043 default: 2044 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2045 } 2046 if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 2047 csrmat = (CsrMatrix*)mat->mat; 2048 /* if the user passed a CPU matrix, copy the data to the GPU */ 2049 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr); 2050 if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);} 2051 ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr); 2052 2053 ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr); 2054 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2055 ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2056 ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr); 2057 } else { 2058 ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr); 2059 ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr); 2060 } 2061 2062 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2063 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2064 cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? 
CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 2065 /* (re)allcoate mmBuffer if not initialized or LDAs are different */ 2066 if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 2067 size_t mmBufferSize; 2068 if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;} 2069 if (!mmdata->matBDescr) { 2070 stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2071 mmdata->Blda = blda; 2072 } 2073 2074 if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;} 2075 if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 2076 stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2077 mmdata->Clda = clda; 2078 } 2079 2080 if (!mat->matDescr) { 2081 stat = cusparseCreateCsr(&mat->matDescr, 2082 csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, 2083 csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), 2084 csrmat->values->data().get(), 2085 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2086 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 2087 } 2088 stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one, 2089 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2090 mmdata->matCDescr,cusparse_scalartype, 2091 cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat); 2092 if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) { 2093 cudaError_t cerr; 2094 cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); 2095 cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr); 2096 mmdata->mmBufferSize = mmBufferSize; 2097 } 2098 mmdata->initialized = PETSC_TRUE; 
2099 } else { 2100 /* to be safe, always update pointers of the mats */ 2101 stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat); 2102 stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat); 2103 stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat); 2104 } 2105 2106 /* do cusparseSpMM, which supports transpose on B */ 2107 stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one, 2108 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2109 mmdata->matCDescr,cusparse_scalartype, 2110 cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2111 #else 2112 PetscInt k; 2113 /* cusparseXcsrmm does not support transpose on B */ 2114 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2115 cublasHandle_t cublasv2handle; 2116 cublasStatus_t cerr; 2117 2118 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 2119 cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T, 2120 B->cmap->n,B->rmap->n, 2121 &PETSC_CUSPARSE_ONE ,barray,blda, 2122 &PETSC_CUSPARSE_ZERO,barray,blda, 2123 mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr); 2124 blda = B->cmap->n; 2125 k = B->cmap->n; 2126 } else { 2127 k = B->rmap->n; 2128 } 2129 2130 /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 2131 stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k, 2132 csrmat->num_entries,mat->alpha_one,mat->descr, 2133 csrmat->values->data().get(), 2134 csrmat->row_offsets->data().get(), 2135 csrmat->column_indices->data().get(), 2136 mmdata->Bt ? 
mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* 2 flops per stored nonzero of A, for each of the n columns of op(B) */
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  if (product->type == MATPRODUCT_RARt) {
    /* the sparse product only computed the intermediate X; finish with a dense-dense product against B (= R) */
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    /* finish P^T*A*P: combine intermediate X with B (= P); transpose flag differs from the RARt branch */
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  if (mmdata->cisdense) { /* C was MATSEQDENSE on input: convert the result back to the CPU type in place */
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) { /* B was converted to CUDA on entry: restore its original type */
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Symbolic phase of a product C = op(A)*op(B) with A of type SeqAIJCUSPARSE and B dense.
   Sets the sizes and type of C, and allocates the MatMatCusparse product data attached to
   C->product: a B^T scratch buffer for CUDA < 11.0 (cusparseXcsrmm cannot transpose B),
   and an intermediate dense matrix X for the PtAP/RARt products. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  /* pick the local sizes of C according to the requested product */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense; /* remembered so the numeric phase can convert C back */
#if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}

/* Numeric phase of a sparse-sparse product C = op(A)*op(B), both SeqAIJCUSPARSE.
   The symbolic phase (below) has already created the CSR structure of C on the GPU and
   stashed the cuSPARSE SpGEMM descriptors/buffers in the MatMatCusparse product data;
   this routine only recomputes the numerical values. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
    goto finalize; /* skip the SpGEMM, only run the assembly bookkeeping below */
  }
  if (!c->nz) goto finalize; /* empty product: nothing to compute */
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);

  /* transposes are handled by using the explicitly stored transpose of A or B, since
     cuSPARSE spgemm only supports NON_TRANSPOSE; symmetric matrices avoid the transpose */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  /* structure of C was fixed in the symbolic phase; only recompute the values */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                                     Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                     cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                     mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  /* old SpGEMM API: recompute the product into the descriptor, then copy it into C's CSR arrays */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#endif
#else
  /* pre-11.0 legacy API computes values directly into C's CSR arrays */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
  ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}

/* Symbolic phase of a sparse-sparse product C = op(A)*op(B), both SeqAIJCUSPARSE.
   Computes the nonzero structure of C on the GPU (via cusparseSpGEMM/SpGEMMreuse or the
   pre-11.0 csrgemm API), mirrors it on the host, and stores the descriptors/buffers needed
   by the numeric phase in the MatMatCusparse product data. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed,ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t         BmatSpDescr;
#else
  int                          cnz;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  if (Acusp->format !=
MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2397 if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2398 2399 /* product data */ 2400 ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2401 C->product->data = mmdata; 2402 C->product->destroy = MatDestroy_MatMatCusparse; 2403 2404 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2405 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 2406 ptype = product->type; 2407 if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 2408 if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 2409 biscompressed = PETSC_FALSE; 2410 ciscompressed = PETSC_FALSE; 2411 switch (ptype) { 2412 case MATPRODUCT_AB: 2413 m = A->rmap->n; 2414 n = B->cmap->n; 2415 k = A->cmap->n; 2416 Amat = Acusp->mat; 2417 Bmat = Bcusp->mat; 2418 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2419 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2420 break; 2421 case MATPRODUCT_AtB: 2422 m = A->cmap->n; 2423 n = B->cmap->n; 2424 k = A->rmap->n; 2425 ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr); 2426 Amat = Acusp->matTranspose; 2427 Bmat = Bcusp->mat; 2428 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2429 break; 2430 case MATPRODUCT_ABt: 2431 m = A->rmap->n; 2432 n = B->rmap->n; 2433 k = A->cmap->n; 2434 ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr); 2435 Amat = Acusp->mat; 2436 Bmat = Bcusp->matTranspose; 2437 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2438 break; 2439 default: 2440 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2441 } 2442 2443 /* create cusparse matrix */ 2444 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2445 ierr = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 2446 c = (Mat_SeqAIJ*)C->data; 2447 Ccusp = 
(Mat_SeqAIJCUSPARSE*)C->spptr; 2448 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 2449 Ccsr = new CsrMatrix; 2450 2451 c->compressedrow.use = ciscompressed; 2452 if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */ 2453 c->compressedrow.nrows = a->compressedrow.nrows; 2454 ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr); 2455 ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr); 2456 Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 2457 Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 2458 Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows); 2459 } else { 2460 c->compressedrow.nrows = 0; 2461 c->compressedrow.i = NULL; 2462 c->compressedrow.rindex = NULL; 2463 Ccusp->workVector = NULL; 2464 Cmat->cprowIndices = NULL; 2465 } 2466 Ccusp->nrows = ciscompressed ? 
c->compressedrow.nrows : m; 2467 Ccusp->mat = Cmat; 2468 Ccusp->mat->mat = Ccsr; 2469 Ccsr->num_rows = Ccusp->nrows; 2470 Ccsr->num_cols = n; 2471 Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1); 2472 stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat); 2473 stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 2474 stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 2475 cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 2476 cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 2477 cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 2478 cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2479 cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2480 cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2481 if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! 
*/ 2482 thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0); 2483 c->nz = 0; 2484 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2485 Ccsr->values = new THRUSTARRAY(c->nz); 2486 goto finalizesym; 2487 } 2488 2489 if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2490 if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2491 Acsr = (CsrMatrix*)Amat->mat; 2492 if (!biscompressed) { 2493 Bcsr = (CsrMatrix*)Bmat->mat; 2494 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2495 BmatSpDescr = Bmat->matDescr; 2496 #endif 2497 } else { /* we need to use row offsets for the full matrix */ 2498 CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat; 2499 Bcsr = new CsrMatrix; 2500 Bcsr->num_rows = B->rmap->n; 2501 Bcsr->num_cols = cBcsr->num_cols; 2502 Bcsr->num_entries = cBcsr->num_entries; 2503 Bcsr->column_indices = cBcsr->column_indices; 2504 Bcsr->values = cBcsr->values; 2505 if (!Bcusp->rowoffsets_gpu) { 2506 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 2507 Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 2508 ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 2509 } 2510 Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 2511 mmdata->Bcsr = Bcsr; 2512 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2513 if (Bcsr->num_rows && Bcsr->num_cols) { 2514 stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, 2515 Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2516 Bcsr->values->data().get(), 2517 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2518 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2519 } 2520 BmatSpDescr = mmdata->matSpBDescr; 2521 #endif 2522 } 2523 if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct"); 2524 if (!Bcsr) 
SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct"); 2525 /* precompute flops count */ 2526 if (ptype == MATPRODUCT_AB) { 2527 for (i=0, flops = 0; i<A->rmap->n; i++) { 2528 const PetscInt st = a->i[i]; 2529 const PetscInt en = a->i[i+1]; 2530 for (j=st; j<en; j++) { 2531 const PetscInt brow = a->j[j]; 2532 flops += 2.*(b->i[brow+1] - b->i[brow]); 2533 } 2534 } 2535 } else if (ptype == MATPRODUCT_AtB) { 2536 for (i=0, flops = 0; i<A->rmap->n; i++) { 2537 const PetscInt anzi = a->i[i+1] - a->i[i]; 2538 const PetscInt bnzi = b->i[i+1] - b->i[i]; 2539 flops += (2.*anzi)*bnzi; 2540 } 2541 } else { /* TODO */ 2542 flops = 0.; 2543 } 2544 2545 mmdata->flops = flops; 2546 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2547 2548 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2549 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2550 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, 2551 NULL, NULL, NULL, 2552 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2553 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2554 stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2555 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2556 { 2557 /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it. 
2558 We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse 2559 */ 2560 void* dBuffer1 = NULL; 2561 void* dBuffer2 = NULL; 2562 void* dBuffer3 = NULL; 2563 /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */ 2564 size_t bufferSize1 = 0; 2565 size_t bufferSize2 = 0; 2566 size_t bufferSize3 = 0; 2567 size_t bufferSize4 = 0; 2568 size_t bufferSize5 = 0; 2569 2570 /*----------------------------------------------------------------------*/ 2571 /* ask bufferSize1 bytes for external memory */ 2572 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2573 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2574 &bufferSize1, NULL);CHKERRCUSPARSE(stat); 2575 cerr = cudaMalloc((void**) &dBuffer1, bufferSize1);CHKERRCUDA(cerr); 2576 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2577 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2578 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2579 &bufferSize1, dBuffer1);CHKERRCUSPARSE(stat); 2580 2581 /*----------------------------------------------------------------------*/ 2582 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2583 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2584 &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);CHKERRCUSPARSE(stat); 2585 cerr = cudaMalloc((void**) &dBuffer2, bufferSize2);CHKERRCUDA(cerr); 2586 cerr = cudaMalloc((void**) &dBuffer3, bufferSize3);CHKERRCUDA(cerr); 2587 cerr = cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4);CHKERRCUDA(cerr); 2588 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2589 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2590 &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, 
mmdata->dBuffer4);CHKERRCUSPARSE(stat); 2591 cerr = cudaFree(dBuffer1);CHKERRCUDA(cerr); 2592 cerr = cudaFree(dBuffer2);CHKERRCUDA(cerr); 2593 2594 /*----------------------------------------------------------------------*/ 2595 /* get matrix C non-zero entries C_nnz1 */ 2596 stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat); 2597 c->nz = (PetscInt) C_nnz1; 2598 /* allocate matrix C */ 2599 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2600 Ccsr->values = new THRUSTARRAY(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2601 /* update matC with the new pointers */ 2602 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2603 Ccsr->values->data().get());CHKERRCUSPARSE(stat); 2604 2605 /*----------------------------------------------------------------------*/ 2606 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2607 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2608 &bufferSize5, NULL);CHKERRCUSPARSE(stat); 2609 cerr = cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5);CHKERRCUDA(cerr); 2610 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2611 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2612 &bufferSize5, mmdata->dBuffer5);CHKERRCUSPARSE(stat); 2613 cerr = cudaFree(dBuffer3);CHKERRCUDA(cerr); 2614 stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, 2615 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2616 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2617 mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2618 ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024);CHKERRQ(ierr); 2619 } 
2620 #else // ~PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2621 size_t bufSize2; 2622 /* ask bufferSize bytes for external memory */ 2623 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 2624 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2625 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2626 mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat); 2627 cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr); 2628 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2629 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 2630 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2631 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2632 mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat); 2633 /* ask bufferSize again bytes for external memory */ 2634 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2635 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2636 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2637 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat); 2638 /* The CUSPARSE documentation is not clear, nor the API 2639 We need both buffers to perform the operations properly! 2640 mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 2641 it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 2642 is stored in the descriptor! What a messy API... 
*/ 2643 cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr); 2644 /* compute the intermediate product of A * B */ 2645 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2646 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2647 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2648 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2649 /* get matrix C non-zero entries C_nnz1 */ 2650 stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat); 2651 c->nz = (PetscInt) C_nnz1; 2652 ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr); 2653 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2654 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2655 Ccsr->values = new THRUSTARRAY(c->nz); 2656 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2657 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2658 Ccsr->values->data().get());CHKERRCUSPARSE(stat); 2659 stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, 2660 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2661 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2662 #endif 2663 #else 2664 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 2665 stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, 2666 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2667 Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2668 Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2669 Cmat->descr, Ccsr->row_offsets->data().get(), 
&cnz);CHKERRCUSPARSE(stat); 2670 c->nz = cnz; 2671 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2672 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2673 Ccsr->values = new THRUSTARRAY(c->nz); 2674 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2675 2676 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2677 /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 2678 I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 2679 D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */ 2680 stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, 2681 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2682 Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2683 Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2684 Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2685 #endif 2686 ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2687 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2688 finalizesym: 2689 c->singlemalloc = PETSC_FALSE; 2690 c->free_a = PETSC_TRUE; 2691 c->free_ij = PETSC_TRUE; 2692 ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 2693 ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 2694 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 2695 PetscInt *d_i = c->i; 2696 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 2697 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 2698 ii = *Ccsr->row_offsets; 2699 jj = *Ccsr->column_indices; 2700 if (ciscompressed) d_i = c->compressedrow.i; 2701 cerr = 
cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2702 cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2703 } else { 2704 PetscInt *d_i = c->i; 2705 if (ciscompressed) d_i = c->compressedrow.i; 2706 cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2707 cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2708 } 2709 if (ciscompressed) { /* need to expand host row offsets */ 2710 PetscInt r = 0; 2711 c->i[0] = 0; 2712 for (k = 0; k < c->compressedrow.nrows; k++) { 2713 const PetscInt next = c->compressedrow.rindex[k]; 2714 const PetscInt old = c->compressedrow.i[k]; 2715 for (; r < next; r++) c->i[r+1] = old; 2716 } 2717 for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows]; 2718 } 2719 ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 2720 ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 2721 ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 2722 c->maxnz = c->nz; 2723 c->nonzerorowcnt = 0; 2724 c->rmax = 0; 2725 for (k = 0; k < m; k++) { 2726 const PetscInt nn = c->i[k+1] - c->i[k]; 2727 c->ilen[k] = c->imax[k] = nn; 2728 c->nonzerorowcnt += (PetscInt)!!nn; 2729 c->rmax = PetscMax(c->rmax,nn); 2730 } 2731 ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr); 2732 ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 2733 Ccsr->num_entries = c->nz; 2734 2735 C->nonzerostate++; 2736 ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr); 2737 ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr); 2738 Ccusp->nonzerostate = C->nonzerostate; 2739 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 2740 C->preallocated = PETSC_TRUE; 2741 C->assembled = PETSC_FALSE; 2742 C->was_assembled = PETSC_FALSE; 
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/* handles sparse or dense B.
   Selects the symbolic-product implementation for mat->product based on whether A, B (and C for ABC)
   are CUSPARSE matrices bound to the GPU, whether B is dense, and user command-line options that can
   force the CPU backend per product type. Falls back to the plain SeqAIJ dispatch otherwise. */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscErrorCode ierr;
  PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr);
  /* B only counts as a CUSPARSE operand when neither A nor B has been bound to the CPU */
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
  }
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr);
    }
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    /* the option name differs depending on whether the product was requested through the
       user-level API (MatMatMult etc.) or the MatProduct interface */
    switch (product->type) {
    case MATPRODUCT_AB:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_ab_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_AtB:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_atb_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_PtAP:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_ptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_RARt:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_rart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_ABC:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_abc_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    default:
      break;
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) {
        ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* yy = A*xx: thin wrapper over the shared mult-add kernel (no add, no transpose) */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* zz = A*xx + yy */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* yy = A^H*xx (conjugate transpose) */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* zz = A^H*xx + yy */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* yy = A^T*xx */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* y[idx[i]] += x[i] for i < n; one thread per entry, bounds-checked.
   Used to scatter the compressed-row work-vector result back into the full output vector. */
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}

/* z = op(A) x + y.
   If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny;
#endif

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) { /* empty matrix: the product is just y (or zero) */
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    /* either let cuSPARSE apply the (conjugate) transpose of A, or multiply with an explicitly stored transpose */
    if (herm || !A->form_explicit_transpose) {
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA is used as an index into the per-operation descriptor cache cuSpMV[]; guard against ABI changes */
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        cudaError_t cerr;
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                       matstruct->matDescr,
                                       matstruct->cuSpMV[opA].vecXDescr, beta,
                                       matstruct->cuSpMV[opA].vecYDescr,
                                       cusparse_scalartype,
                                       cusparsestruct->spmvAlg,
                                       &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                          matstruct->alpha_one,
                          matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTransposeForMult() */
                          matstruct->cuSpMV[opA].vecXDescr,
                          beta,
                          matstruct->cuSpMV[opA].vecYDescr,
                          cusparse_scalartype,
                          cusparsestruct->spmvAlg,
                          matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
#else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
#endif
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
#if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
#else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
#endif
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* zz = A^T*xx + yy */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Runs the SeqAIJ assembly, then drops the cached device matrix if the nonzero structure changed */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode   ierr;
  PetscObjectState onnz = A->nonzerostate;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    cudaError_t cerr;

    ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr);
    cerr = cudaFree(cusp->deviceMat);CHKERRCUDA(cerr);
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz).  By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.
 nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Frees the CUSPARSE-specific data (plain matrix or triangular factors), removes the composed
   function hooks installed at conversion time, then destroys the underlying SeqAIJ matrix */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);

/* Duplicates via the SeqAIJ path, then converts the copy in place back to SEQAIJCUSPARSE */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Y = Y + a*X on the GPU. Uses a plain cublas axpy when the nonzero patterns match,
   cusparse spgeam when X's pattern is a subset of Y's, and falls back to the CPU
   SeqAIJ implementation otherwise (or when the two matrices use different axpy ops). */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
  if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    /* compare row offsets and column indices on the device; identical structure upgrades str */
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    cusparseStatus_t stat;
    PetscScalar      b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t           bufferSize;
    void             *buffer;
    cudaError_t      cerr;
#endif

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    /* alpha/beta (&a, &b) are host pointers here, so switch pointer mode temporarily */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                          cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                               cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    cerr = cudaFree(buffer);CHKERRCUDA(cerr);
#else
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                               cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#endif
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else if (str == SAME_NONZERO_PATTERN) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t berr;
    PetscBLASInt   one = 1, bnz = 1;

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
    ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Y = a*Y, scaling the nonzero values on the GPU with cublas */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *y = (Mat_SeqAIJ*)Y->data;
  PetscScalar    *ay;
  cublasHandle_t cublasv2handle;
  cublasStatus_t berr;
  PetscBLASInt   one = 1, bnz = 1;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(y->nz,&bnz);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  berr = cublasXscal(cublasv2handle,bnz,&a,ay,one);CHKERRCUBLAS(berr);
  ierr = PetscLogGpuFlops(bnz);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Zeros the values of both the device copies (matrix and cached transpose, when present)
   and the host copy, then sets the offload mask accordingly */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  PetscBool      both = PETSC_FALSE;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (spptr->mat) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE; /* device values zeroed too, so host and device stay in sync */
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
    if (spptr->matTranspose) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
      if (matrix->values) {
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
  }
  //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
  ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}

/* Switches the matrix operations and composed functions between the CPU (SeqAIJ)
   and GPU (SeqAIJCUSPARSE) implementations; flg == PETSC_TRUE binds to the CPU */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  if (flg) {
    /* make sure the host has current values before handing control to the CPU ops */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  a->inode.use = flg;
  PetscFunctionReturn(0);
}

/* Converts a SeqAIJ matrix to SEQAIJCUSPARSE: installs the CUSPARSE spptr (handle, format,
   and — for CUDA >= 11 — default SpMV/SpMM/csr2csc algorithms), sets the CUDA default vector
   type, overrides the relevant ops, and binds the matrix to the GPU */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat;

  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      spptr->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      spptr->spmvAlg    = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = spptr;
    } else {
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type constructor: builds a SeqAIJ matrix and converts it in place to SEQAIJCUSPARSE */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to
be used for sparse matrices. 3508 3509 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 3510 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 3511 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 3512 3513 Options Database Keys: 3514 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 3515 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3516 - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3517 3518 Level: beginner 3519 3520 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 3521 M*/ 3522 3523 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*); 3524 3525 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 3526 { 3527 PetscErrorCode ierr; 3528 3529 PetscFunctionBegin; 3530 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr); 3531 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3532 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3533 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3534 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3535 3536 
PetscFunctionReturn(0); 3537 } 3538 3539 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 3540 { 3541 PetscErrorCode ierr; 3542 cusparseStatus_t stat; 3543 3544 PetscFunctionBegin; 3545 if (*cusparsestruct) { 3546 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr); 3547 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr); 3548 delete (*cusparsestruct)->workVector; 3549 delete (*cusparsestruct)->rowoffsets_gpu; 3550 delete (*cusparsestruct)->cooPerm; 3551 delete (*cusparsestruct)->cooPerm_a; 3552 delete (*cusparsestruct)->csr2csc_i; 3553 if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);} 3554 ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr); 3555 } 3556 PetscFunctionReturn(0); 3557 } 3558 3559 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 3560 { 3561 PetscFunctionBegin; 3562 if (*mat) { 3563 delete (*mat)->values; 3564 delete (*mat)->column_indices; 3565 delete (*mat)->row_offsets; 3566 delete *mat; 3567 *mat = 0; 3568 } 3569 PetscFunctionReturn(0); 3570 } 3571 3572 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 3573 { 3574 cusparseStatus_t stat; 3575 PetscErrorCode ierr; 3576 3577 PetscFunctionBegin; 3578 if (*trifactor) { 3579 if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); } 3580 if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); } 3581 ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 3582 if ((*trifactor)->solveBuffer) {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);} 3583 if ((*trifactor)->AA_h) {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);} 3584 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3585 if 
((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);} 3586 #endif 3587 ierr = PetscFree(*trifactor);CHKERRQ(ierr); 3588 } 3589 PetscFunctionReturn(0); 3590 } 3591 3592 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 3593 { 3594 CsrMatrix *mat; 3595 cusparseStatus_t stat; 3596 cudaError_t err; 3597 3598 PetscFunctionBegin; 3599 if (*matstruct) { 3600 if ((*matstruct)->mat) { 3601 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 3602 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3603 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3604 #else 3605 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 3606 stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat); 3607 #endif 3608 } else { 3609 mat = (CsrMatrix*)(*matstruct)->mat; 3610 CsrMatrix_Destroy(&mat); 3611 } 3612 } 3613 if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); } 3614 delete (*matstruct)->cprowIndices; 3615 if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); } 3616 if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); } 3617 if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); } 3618 3619 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3620 Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 3621 if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);} 3622 for (int i=0; i<3; i++) { 3623 if (mdata->cuSpMV[i].initialized) { 3624 err = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err); 3625 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat); 3626 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat); 3627 } 3628 } 3629 #endif 3630 delete *matstruct; 3631 *matstruct = NULL; 
3632 } 3633 PetscFunctionReturn(0); 3634 } 3635 3636 PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors) 3637 { 3638 PetscErrorCode ierr; 3639 3640 PetscFunctionBegin; 3641 if (*trifactors) { 3642 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr); 3643 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr); 3644 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr); 3645 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr); 3646 delete (*trifactors)->rpermIndices; 3647 delete (*trifactors)->cpermIndices; 3648 delete (*trifactors)->workVector; 3649 (*trifactors)->rpermIndices = NULL; 3650 (*trifactors)->cpermIndices = NULL; 3651 (*trifactors)->workVector = NULL; 3652 if ((*trifactors)->a_band_d) {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);} 3653 if ((*trifactors)->i_band_d) {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);} 3654 (*trifactors)->init_dev_prop = PETSC_FALSE; 3655 } 3656 PetscFunctionReturn(0); 3657 } 3658 3659 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 3660 { 3661 PetscErrorCode ierr; 3662 cusparseHandle_t handle; 3663 cusparseStatus_t stat; 3664 3665 PetscFunctionBegin; 3666 if (*trifactors) { 3667 ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr); 3668 if (handle = (*trifactors)->handle) { 3669 stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 3670 } 3671 ierr = PetscFree(*trifactors);CHKERRQ(ierr); 3672 } 3673 PetscFunctionReturn(0); 3674 } 3675 3676 struct IJCompare 3677 { 3678 __host__ __device__ 3679 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3680 { 3681 if (t1.get<0>() < t2.get<0>()) return true; 3682 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() 
< t2.get<1>();
    return false;
  }
};

/* equality of two (row,col) COO index pairs */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
    return true;
  }
};

/* 0 if equal, 1 otherwise: used with adjacent_difference to flag where a new (i,j) pair starts */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return t1 == t2 ? 0 : 1;
  }
};

/* logical OR of the two "changed" flags coming from IJDiff on i and on j */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return t1||t2;
  }
};

#include <thrust/iterator/discard_iterator.h>
/* Inserts/adds the COO values v[] (in the order given to MatSetPreallocationCOO) into the
   device CSR values, using the permutation cusp->cooPerm computed at preallocation time.
   v may be host or device memory; NULL v means "zero" (INSERT_VALUES) or no-op (ADD_VALUES).
   cusp->cooPerm_a, when present, maps repeated COO entries to their unique destination. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY        *cooPerm_v = NULL;
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix          *matrix;
  PetscErrorCode     ierr;
  PetscInt           n;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO preallocation data: fall back to a plain assembly */
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    /* host values: stage them in a temporary device array */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (imode == ADD_VALUES) { /* ADD_VALUES means add to existing ones */
    if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add them */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
        cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonzeros in d_v[].
        cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals()); /* values[i] = d_v[cooPerm[i]] */
    }
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU; /* values were written on the device only */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
  ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr);
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}

/* Marks the cached explicit transpose as out of date; if destroy is true, frees it
   (and the csr2csc index map) outright */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (!cusp) PetscFunctionReturn(0);
  if (destroy) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
    delete cusp->csr2csc_i;
    cusp->csr2csc_i = NULL;
  }
  A->transupdated = PETSC_FALSE;
  PetscFunctionReturn(0);
}

#include <thrust/binary_search.h>
/* Preallocates A from n COO entries (coo_i[], coo_j[]): sorts the entries on the device,
   removes duplicates, builds the host CSR pattern, and records in cusp->cooPerm (and
   cusp->cooPerm_a for repeated entries) the mapping used later by MatSetValuesCOO. */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0;
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    /* size changed: drop any previous permutation data */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);

    /* Ex.
       n = 6
       coo_i = [3,3,1,4,1,4]
       coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    *cusp->cooPerm_a = d_i; /* copy the sorted array */
    THRUSTINTARRAY w = d_j;

    /*
      d_i     = [1,1,3,3,4,4]
      d_j     = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i = [1,3,3,4,4,x]
                       ^ekey
      d_j = [2,2,3,5,6,x]
                     ^nekey
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0] */
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());                                             /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1] */
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());                      /* cooPerm_a = [0,0,1,1,1,1] */
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());          /* cooPerm_a = [0,0,1,2,3,4] */
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,          /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin());                                      /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* rebuild the host CSR pattern from the device results */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr); /* count nonempty rows */
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);

  /* values still need to be set via MatSetValuesCOO before use */
  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}

/* Returns a read-only device pointer to the CSR values of A (copies to the GPU first
   if needed). Pair with MatSeqAIJCUSPARSERestoreArrayRead. */
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  PetscFunctionReturn(0);
}

/* Releases a pointer obtained with MatSeqAIJCUSPARSEGetArrayRead (no state change) */
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  *a = NULL;
  PetscFunctionReturn(0);
}

/* Returns a writable device pointer to the CSR values of A (copies to the GPU first if
   needed); marks the device copy as master and invalidates the cached transpose values */
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Releases a pointer obtained with MatSeqAIJCUSPARSEGetArray; bumps the object state
   since the values may have been modified */
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL;
  PetscFunctionReturn(0);
}

/* Returns a write-only device pointer to the CSR values of A. Unlike GetArray, no
   copy-to-GPU is performed: the current values are assumed to be overwritten. */
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Releases a pointer obtained with MatSeqAIJCUSPARSEGetArrayWrite; bumps the object state */
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL;
  PetscFunctionReturn(0);
}

/* lexicographic (row,col) comparison on (row, col, value, flag) tuples used by the merge */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    if (t1.get<0>() < t2.get<0>()) return true;
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
    return false;
  }
};

/* adds a fixed offset to an index (used to shift B's column indices by A->cmap->n) */
struct Shift
{
  int _shift;

  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &c)
  {
    return c + _shift;
  }
};

/* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows.
[A';B']' operation in matlab notation */ 4054 PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 4055 { 4056 PetscErrorCode ierr; 4057 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c; 4058 Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp; 4059 Mat_SeqAIJCUSPARSEMultStruct *Cmat; 4060 CsrMatrix *Acsr,*Bcsr,*Ccsr; 4061 PetscInt Annz,Bnnz; 4062 cusparseStatus_t stat; 4063 PetscInt i,m,n,zero = 0; 4064 cudaError_t cerr; 4065 4066 PetscFunctionBegin; 4067 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4068 PetscValidHeaderSpecific(B,MAT_CLASSID,2); 4069 PetscValidPointer(C,4); 4070 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4071 PetscCheckTypeName(B,MATSEQAIJCUSPARSE); 4072 if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n); 4073 if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 4074 if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4075 if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4076 if (reuse == MAT_INITIAL_MATRIX) { 4077 m = A->rmap->n; 4078 n = A->cmap->n + B->cmap->n; 4079 ierr = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr); 4080 ierr = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr); 4081 ierr = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 4082 c = (Mat_SeqAIJ*)(*C)->data; 4083 Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 4084 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 4085 Ccsr = new CsrMatrix; 4086 Cmat->cprowIndices = NULL; 4087 c->compressedrow.use = PETSC_FALSE; 4088 c->compressedrow.nrows = 0; 4089 c->compressedrow.i = NULL; 4090 c->compressedrow.rindex = NULL; 4091 Ccusp->workVector = NULL; 4092 Ccusp->nrows = m; 4093 Ccusp->mat = Cmat; 4094 Ccusp->mat->mat = Ccsr; 4095 
Ccsr->num_rows = m; 4096 Ccsr->num_cols = n; 4097 stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat); 4098 stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4099 stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 4100 cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 4101 cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 4102 cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 4103 cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4104 cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4105 cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4106 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4107 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 4108 ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr); 4109 ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr); 4110 if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4111 if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4112 4113 Acsr = (CsrMatrix*)Acusp->mat->mat; 4114 Bcsr = (CsrMatrix*)Bcusp->mat->mat; 4115 Annz = (PetscInt)Acsr->column_indices->size(); 4116 Bnnz = (PetscInt)Bcsr->column_indices->size(); 4117 c->nz = Annz + Bnnz; 4118 Ccsr->row_offsets = new THRUSTINTARRAY32(m+1); 4119 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 4120 Ccsr->values = new THRUSTARRAY(c->nz); 4121 Ccsr->num_entries = c->nz; 4122 Ccusp->cooPerm = new THRUSTINTARRAY(c->nz); 4123 if (c->nz) { 4124 auto Acoo = new THRUSTINTARRAY32(Annz); 4125 auto Bcoo = new THRUSTINTARRAY32(Bnnz); 4126 auto Ccoo = new 
THRUSTINTARRAY32(c->nz); 4127 THRUSTINTARRAY32 *Aroff,*Broff; 4128 4129 if (a->compressedrow.use) { /* need full row offset */ 4130 if (!Acusp->rowoffsets_gpu) { 4131 Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 4132 Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 4133 ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 4134 } 4135 Aroff = Acusp->rowoffsets_gpu; 4136 } else Aroff = Acsr->row_offsets; 4137 if (b->compressedrow.use) { /* need full row offset */ 4138 if (!Bcusp->rowoffsets_gpu) { 4139 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 4140 Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 4141 ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 4142 } 4143 Broff = Bcusp->rowoffsets_gpu; 4144 } else Broff = Bcsr->row_offsets; 4145 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4146 stat = cusparseXcsr2coo(Acusp->handle, 4147 Aroff->data().get(), 4148 Annz, 4149 m, 4150 Acoo->data().get(), 4151 CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4152 stat = cusparseXcsr2coo(Bcusp->handle, 4153 Broff->data().get(), 4154 Bnnz, 4155 m, 4156 Bcoo->data().get(), 4157 CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4158 /* Issues when using bool with large matrices on SUMMIT 10.2.89 */ 4159 auto Aperm = thrust::make_constant_iterator(1); 4160 auto Bperm = thrust::make_constant_iterator(0); 4161 #if PETSC_PKG_CUDA_VERSION_GE(10,0,0) 4162 auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n)); 4163 auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n)); 4164 #else 4165 /* there are issues instantiating the merge operation using a transform iterator for the columns of B */ 4166 auto Bcib = Bcsr->column_indices->begin(); 4167 auto Bcie = Bcsr->column_indices->end(); 4168 thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n)); 4169 #endif 4170 auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz); 4171 auto Azb = 
thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm)); 4172 auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm)); 4173 auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm)); 4174 auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm)); 4175 auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin())); 4176 auto p1 = Ccusp->cooPerm->begin(); 4177 auto p2 = Ccusp->cooPerm->begin(); 4178 thrust::advance(p2,Annz); 4179 PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4())); 4180 #if PETSC_PKG_CUDA_VERSION_LT(10,0,0) 4181 thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n)); 4182 #endif 4183 auto cci = thrust::make_counting_iterator(zero); 4184 auto cce = thrust::make_counting_iterator(c->nz); 4185 #if 0 //Errors on SUMMIT cuda 11.1.0 4186 PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>())); 4187 #else 4188 auto pred = thrust::identity<int>(); 4189 PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred)); 4190 PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred)); 4191 #endif 4192 stat = cusparseXcoo2csr(Ccusp->handle, 4193 Ccoo->data().get(), 4194 c->nz, 4195 m, 4196 Ccsr->row_offsets->data().get(), 4197 CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4198 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4199 delete wPerm; 4200 delete Acoo; 4201 delete Bcoo; 4202 delete Ccoo; 4203 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4204 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, 4205 Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), 
4206 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 4207 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 4208 #endif 4209 if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */ 4210 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4211 Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct; 4212 CsrMatrix *CcsrT = new CsrMatrix; 4213 CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4214 CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4215 4216 (*C)->form_explicit_transpose = PETSC_TRUE; 4217 (*C)->transupdated = PETSC_TRUE; 4218 Ccusp->rowoffsets_gpu = NULL; 4219 CmatT->cprowIndices = NULL; 4220 CmatT->mat = CcsrT; 4221 CcsrT->num_rows = n; 4222 CcsrT->num_cols = m; 4223 CcsrT->num_entries = c->nz; 4224 4225 CcsrT->row_offsets = new THRUSTINTARRAY32(n+1); 4226 CcsrT->column_indices = new THRUSTINTARRAY32(c->nz); 4227 CcsrT->values = new THRUSTARRAY(c->nz); 4228 4229 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4230 auto rT = CcsrT->row_offsets->begin(); 4231 if (AT) { 4232 rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT); 4233 thrust::advance(rT,-1); 4234 } 4235 if (BT) { 4236 auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz)); 4237 auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz)); 4238 thrust::copy(titb,tite,rT); 4239 } 4240 auto cT = CcsrT->column_indices->begin(); 4241 if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT); 4242 if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT); 4243 auto vT = CcsrT->values->begin(); 4244 if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4245 if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 4246 ierr = 
PetscLogGpuTimeEnd();CHKERRQ(ierr); 4247 4248 stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat); 4249 stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4250 stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 4251 cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 4252 cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 4253 cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 4254 cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4255 cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4256 cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4257 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4258 stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, 4259 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), 4260 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 4261 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 4262 #endif 4263 Ccusp->matTranspose = CmatT; 4264 } 4265 } 4266 4267 c->singlemalloc = PETSC_FALSE; 4268 c->free_a = PETSC_TRUE; 4269 c->free_ij = PETSC_TRUE; 4270 ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 4271 ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 4272 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 4273 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 4274 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 4275 ii = *Ccsr->row_offsets; 4276 jj = *Ccsr->column_indices; 4277 cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4278 
cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4279 } else { 4280 cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4281 cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4282 } 4283 ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 4284 ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 4285 ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 4286 c->maxnz = c->nz; 4287 c->nonzerorowcnt = 0; 4288 c->rmax = 0; 4289 for (i = 0; i < m; i++) { 4290 const PetscInt nn = c->i[i+1] - c->i[i]; 4291 c->ilen[i] = c->imax[i] = nn; 4292 c->nonzerorowcnt += (PetscInt)!!nn; 4293 c->rmax = PetscMax(c->rmax,nn); 4294 } 4295 ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr); 4296 ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 4297 (*C)->nonzerostate++; 4298 ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr); 4299 ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr); 4300 Ccusp->nonzerostate = (*C)->nonzerostate; 4301 (*C)->preallocated = PETSC_TRUE; 4302 } else { 4303 if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n); 4304 c = (Mat_SeqAIJ*)(*C)->data; 4305 if (c->nz) { 4306 Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 4307 if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm"); 4308 if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4309 if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate"); 4310 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4311 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 4312 if (!Acusp->mat) 
SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4313 if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4314 Acsr = (CsrMatrix*)Acusp->mat->mat; 4315 Bcsr = (CsrMatrix*)Bcusp->mat->mat; 4316 Ccsr = (CsrMatrix*)Ccusp->mat->mat; 4317 if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size()); 4318 if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size()); 4319 if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size()); 4320 if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries); 4321 if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size()); 4322 auto pmid = Ccusp->cooPerm->begin(); 4323 thrust::advance(pmid,Acsr->num_entries); 4324 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4325 auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), 4326 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin()))); 4327 auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), 4328 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4329 thrust::for_each(zibait,zieait,VecCUDAEquals()); 4330 auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), 4331 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4332 auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), 4333 
thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
      thrust::for_each(zibbit,ziebit,VecCUDAEquals());
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr);
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    }
  }
  ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJCopySubArray_SeqAIJCUSPARSE - gathers a subset of the nonzero values of A into v.

   For i in 0..n-1, sets v[i] = av[idx[i]], where av is the (device-resident) aij value
   array of A obtained via MatSeqAIJCUSPARSEGetArrayRead(). If idx is NULL, the first n
   values are copied contiguously instead.

   v may point to either host or device memory (detected with isCudaMem()); the gather is
   performed on the GPU in both cases, with a device-to-host copy of the result appended
   when v is a host pointer.

   Input Parameters:
+  A   - the SEQAIJCUSPARSE matrix
.  n   - number of values to copy
-  idx - indices (into the aij value array) of the values to gather, or NULL

   Output Parameter:
.  v   - array of n PetscScalars (host or device memory) receiving the values
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;     /* true when v is device (CUDA) memory */
  const PetscScalar *av;      /* device pointer to A's value array */
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    /* upload the gather indices to the device */
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    THRUSTARRAY *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      /* v is host memory: gather into a device scratch array first */
      w  = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* zip (av[idx[i]], dv[i]) pairs and assign the first component to the second */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) {
      cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    delete w;
  } else {
    /* no index set: plain contiguous copy of the first n values */
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  /* when v is host memory the result traveled GPU->CPU above; log it in that
     direction (the original code logged PetscLogCpuToGpu here, which inverted
     the transfer direction in the performance logs) */
  if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); }
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}