/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <thrust/async/for_each.h>

/* String table used by PetscOptionsEnum() to parse -mat_cusparse_storage_format options */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
  } cusparseCsr2CscAlg_t;
*/
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif

/* Forward declarations of the factorization and solve implementations defined later in this file */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

/* Destructors for the GPU-side data structures. Note this file is compiled as C++, so the two
   MatSeqAIJCUSPARSEMultStruct_Destroy declarations below are legal overloads. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);

PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);

/* Records the CUDA stream in the matrix's GPU context and hands it to the cuSPARSE handle,
   so subsequent cuSPARSE calls on this matrix run on that stream. Errors if the GPU context
   (A->spptr) has not been allocated yet. */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

/* Replaces the matrix's cuSPARSE handle with a user-provided one (destroying any handle the
   matrix previously owned) and sets device pointer mode on the new handle. */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

/* Zeros the stored cuSPARSE handle WITHOUT destroying it; used when the handle is owned
   elsewhere. No-op for matrices that are not seqaijcusparse or have no GPU context. */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          flg;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg || !cusparsestruct) PetscFunctionReturn(0);
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

/* Query function composed on factor matrices: reports MATSOLVERCUSPARSE as the solver type */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/* Creates the (empty, symbolic) factor matrix B for the requested factorization type, wires up
   the symbolic-factorization function pointers, and records the preferred orderings
   (nested dissection for full LU/Cholesky, natural for the incomplete variants). */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr);
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr);
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  (*B)->canuseordering = PETSC_TRUE;
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type-specific implementation behind MatCUSPARSESetFormat(): records the requested GPU storage
   format. For SEQAIJCUSPARSE both MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL map to the single
   format field. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.
   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Output Parameter:

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* MatSetOption implementation: handles MAT_FORM_EXPLICIT_TRANSPOSE locally (invalidating any
   cached GPU transpose when the option is turned off), delegates everything else to SeqAIJ. */
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_FORM_EXPLICIT_TRANSPOSE:
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);}
    A->form_explicit_transpose = flg;
    break;
  default:
    ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
    break;
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);

/* Numeric LU: runs the CPU SeqAIJ numeric factorization, then selects the MatSolve variants
   (natural-ordering fast path when both permutations are identities) and pushes the triangular
   factors to the GPU. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Options processing: storage format for SpMV/TriSolve and, for CUDA >= 11, the cuSPARSE
   SpMV/SpMM/csr2csc algorithm choices. The hard-coded integer checks guard against NVIDIA
   renumbering the cuSPARSE enums, since PetscOptionsEnum() maps by array position. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
    if (flg && CUSPARSE_SPMV_CSR_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Symbolic ILU: resets any stale GPU triangular-factor data, delegates the symbolic phase to
   the CPU SeqAIJ implementation, and installs the CUSPARSE numeric factorization. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic LU: same pattern as ILU above — reset GPU factors, CPU symbolic phase, then hook
   the CUSPARSE numeric factorization. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic ICC: reset GPU factors, CPU symbolic phase, hook the CUSPARSE Cholesky numeric. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic Cholesky: reset GPU factors, CPU symbolic phase, hook the CUSPARSE numeric. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Extracts the unit-diagonal lower triangle L from the combined CPU ILU factor stored in A
   (Mat_SeqAIJ), uploads it as a CSR matrix on the GPU, and performs the cuSPARSE triangular
   solve analysis. On repeat calls with an existing factor structure only the numerical values
   are refreshed (sparsity pattern is assumed unchanged). Indices/values are staged in pinned
   host memory (cudaMallocHost) before the thrust-vector uploads. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;

        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix: row 0 holds only the unit diagonal entry */
        AiLo[0]   = (PetscInt) 0;
        AiLo[n]   = nzLower;
        AjLo[0]   = (PetscInt) 0;
        AALo[0]   = (MatScalar) 1.0;
        v         = aa;
        vi        = aj;
        offset    = 1;
        rowOffset = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          /* append the unit diagonal entry after the strictly-lower part of row i */
          offset += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 loTriFactor->solveInfo,
                                 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                                 loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer; keep the pinned values buffer (AA_h) for later value-only updates */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h = AALo;
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v      = aa;
        vi     = aj;
        offset = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 510 } 511 } 512 PetscFunctionReturn(0); 513 } 514 515 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 516 { 517 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 518 PetscInt n = A->rmap->n; 519 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 520 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 521 cusparseStatus_t stat; 522 const PetscInt *aj = a->j,*adiag = a->diag,*vi; 523 const MatScalar *aa = a->a,*v; 524 PetscInt *AiUp, *AjUp; 525 PetscInt i,nz, nzUpper, offset; 526 PetscErrorCode ierr; 527 cudaError_t cerr; 528 529 PetscFunctionBegin; 530 if (!n) PetscFunctionReturn(0); 531 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 532 try { 533 /* next, figure out the number of nonzeros in the upper triangular matrix. */ 534 nzUpper = adiag[0]-adiag[n]; 535 if (!upTriFactor) { 536 PetscScalar *AAUp; 537 538 cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 539 540 /* Allocate Space for the upper triangular matrix */ 541 cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 542 cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 543 544 /* Fill the upper triangular matrix */ 545 AiUp[0]=(PetscInt) 0; 546 AiUp[n]=nzUpper; 547 offset = nzUpper; 548 for (i=n-1; i>=0; i--) { 549 v = aa + adiag[i+1] + 1; 550 vi = aj + adiag[i+1] + 1; 551 552 /* number of elements NOT on the diagonal */ 553 nz = adiag[i] - adiag[i+1]-1; 554 555 /* decrement the offset */ 556 offset -= (nz+1); 557 558 /* first, set the diagonal elements */ 559 AjUp[offset] = (PetscInt) i; 560 AAUp[offset] = (MatScalar)1./v[nz]; 561 AiUp[i] = AiUp[i+1] - (nz+1); 562 563 ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr); 564 ierr = PetscArraycpy(&(AAUp[offset+1]), v, 
nz);CHKERRQ(ierr); 565 } 566 567 /* allocate space for the triangular factor information */ 568 ierr = PetscNew(&upTriFactor);CHKERRQ(ierr); 569 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 570 571 /* Create the matrix description */ 572 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 573 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 574 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 575 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 576 #else 577 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 578 #endif 579 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 580 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 581 582 /* set the operation */ 583 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 584 585 /* set the matrix */ 586 upTriFactor->csrMat = new CsrMatrix; 587 upTriFactor->csrMat->num_rows = n; 588 upTriFactor->csrMat->num_cols = n; 589 upTriFactor->csrMat->num_entries = nzUpper; 590 591 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 592 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 593 594 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 595 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 596 597 upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 598 upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 599 600 /* Create the solve analysis information */ 601 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 602 stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 603 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 604 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 605 upTriFactor->csrMat->num_rows, 
upTriFactor->csrMat->num_entries, upTriFactor->descr, 606 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 607 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 608 &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 609 cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 610 #endif 611 612 /* perform the solve analysis */ 613 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 614 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 615 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 616 upTriFactor->csrMat->column_indices->data().get(), 617 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 618 upTriFactor->solveInfo, 619 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 620 #else 621 upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 622 #endif 623 cerr = WaitForCUDA();CHKERRCUDA(cerr); 624 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 625 626 /* assign the pointer */ 627 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 628 upTriFactor->AA_h = AAUp; 629 cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 630 cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 631 ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 632 } else { 633 if (!upTriFactor->AA_h) { 634 cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 635 } 636 /* Fill the upper triangular matrix */ 637 offset = nzUpper; 638 for (i=n-1; i>=0; i--) { 639 v = aa + adiag[i+1] + 1; 640 641 /* number of elements NOT on the diagonal */ 642 nz = adiag[i] - adiag[i+1]-1; 643 644 /* decrement the offset */ 645 offset -= (nz+1); 646 647 /* first, set the diagonal elements */ 648 upTriFactor->AA_h[offset] = 1./v[nz]; 649 ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, 
nz);CHKERRQ(ierr); 650 } 651 upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper); 652 ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 653 } 654 } catch(char *ex) { 655 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 656 } 657 } 658 PetscFunctionReturn(0); 659 } 660 661 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 662 { 663 PetscErrorCode ierr; 664 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 665 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 666 IS isrow = a->row,iscol = a->icol; 667 PetscBool row_identity,col_identity; 668 PetscInt n = A->rmap->n; 669 670 PetscFunctionBegin; 671 if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 672 ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr); 673 ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr); 674 675 if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 676 cusparseTriFactors->nnz=a->nz; 677 678 A->offloadmask = PETSC_OFFLOAD_BOTH; 679 /* lower triangular indices */ 680 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 681 if (!row_identity && !cusparseTriFactors->rpermIndices) { 682 const PetscInt *r; 683 684 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 685 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 686 cusparseTriFactors->rpermIndices->assign(r, r+n); 687 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 688 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 689 } 690 691 /* upper triangular indices */ 692 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 693 if (!col_identity && !cusparseTriFactors->cpermIndices) { 694 const PetscInt *c; 695 696 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 697 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 698 cusparseTriFactors->cpermIndices->assign(c, c+n); 699 ierr = 
ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Build the ICC triangular factors on the GPU from the host SBAIJ factor.
   The factor is stored once as an upper-triangular CSR; the "lower" solve
   reuses the same sparsity via CUSPARSE_OPERATION_TRANSPOSE. On repeated
   numeric factorizations only the values are re-uploaded. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp,*AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the factor values */
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) {
        /* first time: also build the CSR structure of the upper triangle */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        AiUp[0] = (PetscInt) 0;
        AiUp[n] = nzUpper;
        offset  = 0;
        for (i=0; i<n; i++) {
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* diagonal entry first; SBAIJ stores 1/diag at the end of the row */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset += 1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset += nz;
          }
        }

        /* ---- upper triangular factor ---- */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = A->rmap->n;
        upTriFactor->csrMat->num_cols    = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 upTriFactor->solveInfo,
                                 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                                 upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* ---- lower triangular factor: same structure, solved via transpose ---- */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        /* stored upper + transpose op == logical lower triangle */
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = A->rmap->n;
        loTriFactor->csrMat->num_cols    = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 loTriFactor->solveInfo,
                                 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                                 loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else {
        /* structure already on the GPU: refresh only the numerical values */
        offset = 0;
        for (i=0; i<n; i++) {
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset += 1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset += nz;
          }
        }
        if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Build the ICC factors on the GPU and cache the (inverse) permutation
   indices required when the Cholesky ordering is not the identity. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  /* nnz of the full (symmetric) factor pair: off-diagonals twice plus diagonal once */
  cusparseTriFactors->nnz = (a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip,*rip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Numeric Cholesky: factor on the CPU, select the MatSolve variants based on
   the permutation, then push the factors to the GPU. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr =
MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Build transposed (CSC) copies of both triangular factors and run the
   cusparse solve analysis on them, so MatSolveTranspose can use
   NON_TRANSPOSE solves on the transposed data.

   Fixes: the two csr2csc conversions below were bracketed by two
   PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose) calls with no matching
   End, corrupting event nesting; the second call of each pair is now
   PetscLogEventEnd. */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* copy the descriptor of the lower factor, flipping the fill mode */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(loTriFactor->descr);

  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor */
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                       loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                       loTriFactor->csrMat->values->data().get(),
                                       loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(),
                                       loTriFactorT->csrMat->values->data().get(),
                                       loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
#else
                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
#endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); /* was erroneously PetscLogEventBegin */

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                           loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                           loTriFactorT->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           loTriFactorT->solveInfo,
                           loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                           loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* copy the descriptor of the upper factor, flipping the fill mode */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(upTriFactor->descr);

  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor */
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                       upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                       upTriFactor->csrMat->values->data().get(),
                                       upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(),
                                       upTriFactorT->csrMat->values->data().get(),
                                       upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
#else
                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
#endif

  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); /* was erroneously PetscLogEventBegin */

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                           upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                           upTriFactorT->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           upTriFactorT->solveInfo,
                           upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                           upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#endif

  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}

/* Functor: truncate the real part of a PetscScalar to a PetscInt; used to
   recover the csr2csc permutation from a transposed "values" array. */
struct PetscScalarToPetscInt
{
  __host__ __device__
  PetscInt operator()(PetscScalar s)
  {
    return (PetscInt)PetscRealPart(s);
  }
};

static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
cusparseStatus_t stat; 1214 cusparseIndexBase_t indexBase; 1215 cudaError_t err; 1216 PetscErrorCode ierr; 1217 1218 PetscFunctionBegin; 1219 if (!A->form_explicit_transpose || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0); 1220 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1221 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1222 if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct"); 1223 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1224 if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct"); 1225 if (A->transupdated) PetscFunctionReturn(0); 1226 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1227 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1228 if (cusparsestruct->format != MAT_CUSPARSE_CSR) { 1229 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 1230 } 1231 if (!cusparsestruct->matTranspose) { /* create cusparse matrix */ 1232 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 1233 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat); 1234 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1235 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat); 1236 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1237 1238 /* set alpha and beta */ 1239 err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 1240 err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1241 err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1242 err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1243 err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1244 err = 
cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1245 1246 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1247 CsrMatrix *matrixT = new CsrMatrix; 1248 matstructT->mat = matrixT; 1249 matrixT->num_rows = A->cmap->n; 1250 matrixT->num_cols = A->rmap->n; 1251 matrixT->num_entries = a->nz; 1252 matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 1253 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1254 matrixT->values = new THRUSTARRAY(a->nz); 1255 1256 if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); } 1257 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 1258 1259 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1260 stat = cusparseCreateCsr(&matstructT->matDescr, 1261 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1262 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1263 matrixT->values->data().get(), 1264 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1265 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 1266 #endif 1267 } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 1268 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1269 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1270 #else 1271 CsrMatrix *temp = new CsrMatrix; 1272 CsrMatrix *tempT = new CsrMatrix; 1273 /* First convert HYB to CSR */ 1274 temp->num_rows = A->rmap->n; 1275 temp->num_cols = A->cmap->n; 1276 temp->num_entries = a->nz; 1277 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1278 temp->column_indices = new THRUSTINTARRAY32(a->nz); 1279 temp->values = new THRUSTARRAY(a->nz); 1280 1281 stat = cusparse_hyb2csr(cusparsestruct->handle, 1282 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 1283 temp->values->data().get(), 1284 
temp->row_offsets->data().get(), 1285 temp->column_indices->data().get());CHKERRCUSPARSE(stat); 1286 1287 /* Next, convert CSR to CSC (i.e. the matrix transpose) */ 1288 tempT->num_rows = A->rmap->n; 1289 tempT->num_cols = A->cmap->n; 1290 tempT->num_entries = a->nz; 1291 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1292 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1293 tempT->values = new THRUSTARRAY(a->nz); 1294 1295 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1296 temp->num_cols, temp->num_entries, 1297 temp->values->data().get(), 1298 temp->row_offsets->data().get(), 1299 temp->column_indices->data().get(), 1300 tempT->values->data().get(), 1301 tempT->column_indices->data().get(), 1302 tempT->row_offsets->data().get(), 1303 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1304 1305 /* Last, convert CSC to HYB */ 1306 cusparseHybMat_t hybMat; 1307 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1308 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 
1309 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1310 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1311 matstructT->descr, tempT->values->data().get(), 1312 tempT->row_offsets->data().get(), 1313 tempT->column_indices->data().get(), 1314 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1315 1316 /* assign the pointer */ 1317 matstructT->mat = hybMat; 1318 A->transupdated = PETSC_TRUE; 1319 /* delete temporaries */ 1320 if (tempT) { 1321 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1322 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1323 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1324 delete (CsrMatrix*) tempT; 1325 } 1326 if (temp) { 1327 if (temp->values) delete (THRUSTARRAY*) temp->values; 1328 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1329 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1330 delete (CsrMatrix*) temp; 1331 } 1332 #endif 1333 } 1334 } 1335 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */ 1336 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1337 CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat; 1338 if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix"); 1339 if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows"); 1340 if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols"); 1341 if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values"); 1342 if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT"); 1343 if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows"); 1344 if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols"); 1345 if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values"); 1346 if 
(!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */ 1347 cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1348 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 1349 ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 1350 } 1351 if (!cusparsestruct->csr2csc_i) { 1352 THRUSTARRAY csr2csc_a(matrix->num_entries); 1353 PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0)); 1354 1355 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1356 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1357 void *csr2cscBuffer; 1358 size_t csr2cscBufferSize; 1359 stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, 1360 A->cmap->n, matrix->num_entries, 1361 matrix->values->data().get(), 1362 cusparsestruct->rowoffsets_gpu->data().get(), 1363 matrix->column_indices->data().get(), 1364 matrixT->values->data().get(), 1365 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1366 CUSPARSE_ACTION_NUMERIC,indexBase, 1367 cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat); 1368 err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err); 1369 #endif 1370 1371 if (matrix->num_entries) { 1372 /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in 1373 mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK. 1374 I checked every parameters and they were just fine. I have no clue why cusparse complains. 1375 1376 Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[] 1377 should be filled with indexBase. So I just take a shortcut here. 
1378 */ 1379 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1380 A->cmap->n,matrix->num_entries, 1381 csr2csc_a.data().get(), 1382 cusparsestruct->rowoffsets_gpu->data().get(), 1383 matrix->column_indices->data().get(), 1384 matrixT->values->data().get(), 1385 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1386 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1387 CUSPARSE_ACTION_NUMERIC,indexBase, 1388 cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat); 1389 #else 1390 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1391 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1392 #endif 1393 } else { 1394 matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 1395 } 1396 1397 cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); 1398 PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt())); 1399 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1400 err = cudaFree(csr2cscBuffer);CHKERRCUDA(err); 1401 #endif 1402 } 1403 PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), 1404 thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), 1405 matrixT->values->begin())); 1406 } 1407 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1408 ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1409 /* the compressed row indices is not used for matTranspose */ 1410 matstructT->cprowIndices = NULL; 1411 /* assign the pointer */ 1412 ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 1413 A->transupdated = PETSC_TRUE; 1414 PetscFunctionReturn(0); 1415 } 1416 1417 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? 
*/
/*
   Solves A^T x = b using the transposed triangular factors held in cusparseTriFactors,
   applying the row permutation to b up front and the column permutation to x at the end.

   The transposed factor structures (loTriFactorT/upTriFactorT) are built lazily on the
   first call via MatSeqAIJCUSPARSEAnalyzeTransposeForSolve().  Note the solve order is
   the reverse of the non-transposed case: U^T first, then L^T.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: x = b(rperm) */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U: reads xarray, writes the intermediate result into tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
#endif

  /* Then, solve L: consumes tempGPU and writes back into xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        xarray);CHKERRCUSPARSE(stat);
#endif

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Same as MatSolveTranspose_SeqAIJCUSPARSE, but for a factorization done with the
   natural (identity) ordering: no row/column permutations are applied, so b feeds
   the U^T solve directly and the L^T solve writes x directly. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U: reads b directly (no permutation), writes into the tempGPU work vector */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
#endif

  /* Then, solve L: consumes tempGPU and writes the final solution into xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        xarray);CHKERRCUSPARSE(stat);
#endif

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Solves A x = b using the (non-transposed) triangular factors in cusparseTriFactors,
   applying the row permutation to b before the solves and the column permutation to
   the result afterwards.  Solve order: L first, then U.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: tempGPU = b(rperm) */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        xarray);CHKERRCUSPARSE(stat);
#endif

  /* Then, solve U: consumes xarray (the L-solve result) and writes into tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,xarray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
#endif

  /* Last, reorder with the column permutation: x = tempGPU(cperm) */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Same as MatSolve_SeqAIJCUSPARSE but for a factorization with the natural (identity)
   ordering: no permutations, so the L solve reads b directly and the U solve writes
   the solution x directly. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        loTriFactor->solvePolicy,loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
#endif

  /* Next, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        xarray);CHKERRCUSPARSE(stat);
#endif

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Copy the CSR values (the values only; the pattern is unchanged) from the GPU back
   to the host when the GPU holds the up-to-date copy.  Afterwards both copies are
   marked valid (PETSC_OFFLOAD_BOTH). */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cudaError_t        cerr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat;

    ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

/* MatSeqAIJGetArray() implementation: sync values from the GPU, then hand out the
   host array.  The offload mask is set to CPU, making the host copy authoritative
   (the caller may modify the returned array). */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  *array = a->a;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}

/* Copy the matrix from the host to the GPU.  If the nonzero pattern is unchanged
   (same nonzerostate, CSR format) only the values are transferred; otherwise the
   cusparse structures are rebuilt from scratch. */
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu)
SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      matrix->values->assign(a->a, a->a+a->nz);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* values changed, so any cached transpose is stale; PETSC_FALSE keeps its structure */
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
    } else {
      PetscInt nnz;
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* nonzero pattern changed (or non-CSR format): destroy and rebuild everything */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* compressed-row storage keeps only the nonempty rows; ridx maps back to full row indices */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
        if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");
        /* a->a may be NULL (structure-only matrix); then only the pattern goes to the GPU
           and the host copy cannot be declared in sync (both = PETSC_FALSE) */
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
        else nnz = a->nz;

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident scalar constants, required by CUSPARSE_POINTER_MODE_DEVICE set below */
        err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows       = m;
          mat->num_cols       = A->cmap->n;
          mat->num_entries    = nnz;
          mat->row_offsets    = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                     mat->num_rows, mat->num_cols, mat->num_entries,
                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                     mat->values->data().get(),
                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* build a temporary CSR, convert it to HYB/ELL, then discard the CSR copy */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows       = m;
          mat->num_cols       = A->cmap->n;
          mat->num_entries    = nnz;
          mat->row_offsets    = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
                                  matstruct->descr, mat->values->data().get(),
                                  mat->row_offsets->data().get(),
                                  mat->column_indices->data().get(),
                                  hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

/* Thrust functor: in-place add, second tuple element += first */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/* Thrust functor: assignment, second tuple element = first */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/* Thrust functor: assignment in the opposite direction, first tuple element = second */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};

/* Scratch data attached to a Mat_Product for AIJCUSPARSE products; kept alive across
   numeric calls so GPU buffers and cusparse descriptors can be reused. */
struct MatMatCusparse {
  PetscBool      cisdense;  /* was C originally MATSEQDENSE (host)? if so, convert back after numeric */
  PetscScalar    *Bt;       /* device buffer holding an explicit B^T (pre-CUDA-11 csrmm path) */
  Mat            X;         /* intermediate dense product, used by PtAP/RARt */
  PetscBool      reusesym;  /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;
  CsrMatrix      *Bcsr;

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t matSpBDescr;
  PetscBool            initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;
  cusparseDnMatDescr_t matCDescr;
  PetscInt             Blda,Clda;     /* Record leading dimensions of B and C here to detect changes*/
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  void *dBuffer4;
  void *dBuffer5;
#endif
  size_t mmBufferSize;
  void   *mmBuffer;
  void   *mmBuffer2;                  /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};

/* Destructor for the MatMatCusparse product data: frees the device buffers and
   cusparse descriptors (if created), destroys the intermediate matrix X, then
   frees the struct itself. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  PetscErrorCode ierr;
  MatMatCusparse *mmdata = (MatMatCusparse *)data;
  cudaError_t    cerr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
#endif

  PetscFunctionBegin;
  cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
  delete mmdata->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->matBDescr)   { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->matCDescr)   { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->spgemmDesc)  { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); }
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  if (mmdata->dBuffer4) { cerr = cudaFree(mmdata->dBuffer4);CHKERRCUDA(cerr); }
  if (mmdata->dBuffer5) { cerr = cudaFree(mmdata->dBuffer5);CHKERRCUDA(cerr); }
#endif
  if (mmdata->mmBuffer)  { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); }
  if (mmdata->mmBuffer2) { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); }
#endif
  ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);

/* Numeric phase of C = op(A) op(B) with A sparse (MATSEQAIJCUSPARSE) and B dense
   (MATSEQDENSECUDA).  For PtAP/RARt the sparse-dense product is first computed into
   the intermediate mmdata->X, and the final dense-dense product is done afterwards. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* select op(A), the mult structure to use, and the result dimensions m x n */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2024 m = A->rmap->n; 2025 n = B->cmap->n; 2026 break; 2027 case MATPRODUCT_AtB: 2028 if (!A->form_explicit_transpose) { 2029 mat = cusp->mat; 2030 opA = CUSPARSE_OPERATION_TRANSPOSE; 2031 } else { 2032 ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr); 2033 mat = cusp->matTranspose; 2034 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2035 } 2036 m = A->cmap->n; 2037 n = B->cmap->n; 2038 break; 2039 case MATPRODUCT_ABt: 2040 case MATPRODUCT_RARt: 2041 mat = cusp->mat; 2042 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2043 m = A->rmap->n; 2044 n = B->rmap->n; 2045 break; 2046 default: 2047 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2048 } 2049 if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 2050 csrmat = (CsrMatrix*)mat->mat; 2051 /* if the user passed a CPU matrix, copy the data to the GPU */ 2052 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr); 2053 if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);} 2054 ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr); 2055 2056 ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr); 2057 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2058 ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2059 ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr); 2060 } else { 2061 ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr); 2062 ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr); 2063 } 2064 2065 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2066 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2067 cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? 
CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 2068 /* (re)allocate mmBuffer if not initialized or LDAs are different */ 2069 if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 2070 size_t mmBufferSize; 2071 if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;} 2072 if (!mmdata->matBDescr) { 2073 stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2074 mmdata->Blda = blda; 2075 } 2076 2077 if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;} 2078 if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 2079 stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2080 mmdata->Clda = clda; 2081 } 2082 2083 if (!mat->matDescr) { 2084 stat = cusparseCreateCsr(&mat->matDescr, 2085 csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, 2086 csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), 2087 csrmat->values->data().get(), 2088 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2089 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 2090 } 2091 stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one, 2092 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2093 mmdata->matCDescr,cusparse_scalartype, 2094 cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat); 2095 if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) { 2096 cudaError_t cerr; 2097 cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); 2098 cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr); 2099 mmdata->mmBufferSize = mmBufferSize; 2100 } 2101 mmdata->initialized = PETSC_TRUE; 
2102 } else { 2103 /* to be safe, always update pointers of the mats */ 2104 stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat); 2105 stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat); 2106 stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat); 2107 } 2108 2109 /* do cusparseSpMM, which supports transpose on B */ 2110 stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one, 2111 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2112 mmdata->matCDescr,cusparse_scalartype, 2113 cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2114 #else 2115 PetscInt k; 2116 /* cusparseXcsrmm does not support transpose on B */ 2117 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2118 cublasHandle_t cublasv2handle; 2119 cublasStatus_t cerr; 2120 2121 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 2122 cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T, 2123 B->cmap->n,B->rmap->n, 2124 &PETSC_CUSPARSE_ONE ,barray,blda, 2125 &PETSC_CUSPARSE_ZERO,barray,blda, 2126 mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr); 2127 blda = B->cmap->n; 2128 k = B->cmap->n; 2129 } else { 2130 k = B->rmap->n; 2131 } 2132 2133 /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 2134 stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k, 2135 csrmat->num_entries,mat->alpha_one,mat->descr, 2136 csrmat->values->data().get(), 2137 csrmat->row_offsets->data().get(), 2138 csrmat->column_indices->data().get(), 2139 mmdata->Bt ? 
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  /* For RARt/PtAP the sparse-dense product was written into the intermediate mmdata->X;
     finish with a dense-dense multiply (transposing B for PtAP) to obtain C */
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* if the caller's matrices were plain (CPU) SEQDENSE, convert results/inputs back in place */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
  Symbolic phase for sparse(AIJCUSPARSE) x dense products (AB, AtB, ABt, PtAP, RARt).

  Sets the sizes of C from the product type, forces C to MATSEQDENSECUDA (remembering in
  mmdata->cisdense whether the caller had a plain MATSEQDENSE so the numeric phase can
  convert back), and allocates the MatMatCusparse product data:
  - pre CUDA-11 only: a device buffer mmdata->Bt to hold an explicit B^T, since the
    legacy cusparseXcsrmm does not support op(B) = B^T;
  - for PtAP/RARt: an intermediate MATSEQDENSECUDA matrix mmdata->X (not preallocated;
    the first MatDenseCUDAGetArray call allocates it on the GPU).
  Only A of type MATSEQAIJCUSPARSE with MAT_CUSPARSE_CSR storage is supported.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  /* result sizes depend on the product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense;
#if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}

/*
  Numeric phase for sparse(AIJCUSPARSE) x sparse(AIJCUSPARSE) products (AB, AtB, ABt).

  Reuses the CSR structures and (CUDA >= 11) the cusparse SpGEMM descriptors created in
  MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; only the values of C are recomputed.
  If mmdata->reusesym is set, the values were already computed during the symbolic phase
  and this routine only performs the (shortened) assembly bookkeeping at 'finalize'.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    if (!Ccsr)
SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize; /* structurally empty C: nothing to compute, just re-assemble */
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  /* make sure the device copies of A and B are up to date */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);

  /* symmetric operands let us avoid the explicit transpose */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  /* structure-reuse path: descriptors/buffers were set up by cusparseSpGEMMreuse_* in the symbolic phase */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                                     Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                     cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                     mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  /* recompute the values with the work buffer kept from the symbolic phase, then copy into C's CSR arrays */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#endif
#else
  /* legacy (pre CUDA-11) csrgemm interface: recomputes values directly into C's CSR arrays */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
  ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}

/*
  Symbolic phase for sparse(AIJCUSPARSE) x sparse(AIJCUSPARSE) products (AB, AtB, ABt).

  Determines the sparsity pattern of C on the GPU (via cusparseSpGEMMreuse_* on CUDA >= 11.4,
  cusparseSpGEMM_* on CUDA 11.0-11.3, or the legacy csrgemm interface before that, where the
  values are computed as well), builds C's device CSR structures and host (i,j) arrays, and
  stores descriptors/buffers in the MatMatCusparse product data for reuse by the numeric phase.
  Handles compressed-row operands and forms explicit transposes where the product needs them.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed,ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t         BmatSpDescr;
#else
  int                          cnz;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  if (Acusp->format !=
MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2400 if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2401 2402 /* product data */ 2403 ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2404 C->product->data = mmdata; 2405 C->product->destroy = MatDestroy_MatMatCusparse; 2406 2407 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2408 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 2409 ptype = product->type; 2410 if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 2411 if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 2412 biscompressed = PETSC_FALSE; 2413 ciscompressed = PETSC_FALSE; 2414 switch (ptype) { 2415 case MATPRODUCT_AB: 2416 m = A->rmap->n; 2417 n = B->cmap->n; 2418 k = A->cmap->n; 2419 Amat = Acusp->mat; 2420 Bmat = Bcusp->mat; 2421 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2422 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2423 break; 2424 case MATPRODUCT_AtB: 2425 m = A->cmap->n; 2426 n = B->cmap->n; 2427 k = A->rmap->n; 2428 ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr); 2429 Amat = Acusp->matTranspose; 2430 Bmat = Bcusp->mat; 2431 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2432 break; 2433 case MATPRODUCT_ABt: 2434 m = A->rmap->n; 2435 n = B->rmap->n; 2436 k = A->cmap->n; 2437 ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr); 2438 Amat = Acusp->mat; 2439 Bmat = Bcusp->matTranspose; 2440 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2441 break; 2442 default: 2443 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2444 } 2445 2446 /* create cusparse matrix */ 2447 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2448 ierr = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 2449 c = (Mat_SeqAIJ*)C->data; 2450 Ccusp = 
(Mat_SeqAIJCUSPARSE*)C->spptr; 2451 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 2452 Ccsr = new CsrMatrix; 2453 2454 c->compressedrow.use = ciscompressed; 2455 if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */ 2456 c->compressedrow.nrows = a->compressedrow.nrows; 2457 ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr); 2458 ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr); 2459 Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 2460 Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 2461 Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows); 2462 } else { 2463 c->compressedrow.nrows = 0; 2464 c->compressedrow.i = NULL; 2465 c->compressedrow.rindex = NULL; 2466 Ccusp->workVector = NULL; 2467 Cmat->cprowIndices = NULL; 2468 } 2469 Ccusp->nrows = ciscompressed ? 
c->compressedrow.nrows : m; 2470 Ccusp->mat = Cmat; 2471 Ccusp->mat->mat = Ccsr; 2472 Ccsr->num_rows = Ccusp->nrows; 2473 Ccsr->num_cols = n; 2474 Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1); 2475 stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat); 2476 stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 2477 stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 2478 cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 2479 cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 2480 cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 2481 cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2482 cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2483 cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2484 if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! 
*/ 2485 thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0); 2486 c->nz = 0; 2487 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2488 Ccsr->values = new THRUSTARRAY(c->nz); 2489 goto finalizesym; 2490 } 2491 2492 if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2493 if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2494 Acsr = (CsrMatrix*)Amat->mat; 2495 if (!biscompressed) { 2496 Bcsr = (CsrMatrix*)Bmat->mat; 2497 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2498 BmatSpDescr = Bmat->matDescr; 2499 #endif 2500 } else { /* we need to use row offsets for the full matrix */ 2501 CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat; 2502 Bcsr = new CsrMatrix; 2503 Bcsr->num_rows = B->rmap->n; 2504 Bcsr->num_cols = cBcsr->num_cols; 2505 Bcsr->num_entries = cBcsr->num_entries; 2506 Bcsr->column_indices = cBcsr->column_indices; 2507 Bcsr->values = cBcsr->values; 2508 if (!Bcusp->rowoffsets_gpu) { 2509 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 2510 Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 2511 ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 2512 } 2513 Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 2514 mmdata->Bcsr = Bcsr; 2515 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2516 if (Bcsr->num_rows && Bcsr->num_cols) { 2517 stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, 2518 Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2519 Bcsr->values->data().get(), 2520 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2521 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2522 } 2523 BmatSpDescr = mmdata->matSpBDescr; 2524 #endif 2525 } 2526 if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct"); 2527 if (!Bcsr) 
SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct"); 2528 /* precompute flops count */ 2529 if (ptype == MATPRODUCT_AB) { 2530 for (i=0, flops = 0; i<A->rmap->n; i++) { 2531 const PetscInt st = a->i[i]; 2532 const PetscInt en = a->i[i+1]; 2533 for (j=st; j<en; j++) { 2534 const PetscInt brow = a->j[j]; 2535 flops += 2.*(b->i[brow+1] - b->i[brow]); 2536 } 2537 } 2538 } else if (ptype == MATPRODUCT_AtB) { 2539 for (i=0, flops = 0; i<A->rmap->n; i++) { 2540 const PetscInt anzi = a->i[i+1] - a->i[i]; 2541 const PetscInt bnzi = b->i[i+1] - b->i[i]; 2542 flops += (2.*anzi)*bnzi; 2543 } 2544 } else { /* TODO */ 2545 flops = 0.; 2546 } 2547 2548 mmdata->flops = flops; 2549 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2550 2551 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2552 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2553 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, 2554 NULL, NULL, NULL, 2555 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2556 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2557 stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2558 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2559 { 2560 /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it. 
2561 We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse 2562 */ 2563 void* dBuffer1 = NULL; 2564 void* dBuffer2 = NULL; 2565 void* dBuffer3 = NULL; 2566 /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */ 2567 size_t bufferSize1 = 0; 2568 size_t bufferSize2 = 0; 2569 size_t bufferSize3 = 0; 2570 size_t bufferSize4 = 0; 2571 size_t bufferSize5 = 0; 2572 2573 /*----------------------------------------------------------------------*/ 2574 /* ask bufferSize1 bytes for external memory */ 2575 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2576 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2577 &bufferSize1, NULL);CHKERRCUSPARSE(stat); 2578 cerr = cudaMalloc((void**) &dBuffer1, bufferSize1);CHKERRCUDA(cerr); 2579 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2580 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2581 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2582 &bufferSize1, dBuffer1);CHKERRCUSPARSE(stat); 2583 2584 /*----------------------------------------------------------------------*/ 2585 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2586 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2587 &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);CHKERRCUSPARSE(stat); 2588 cerr = cudaMalloc((void**) &dBuffer2, bufferSize2);CHKERRCUDA(cerr); 2589 cerr = cudaMalloc((void**) &dBuffer3, bufferSize3);CHKERRCUDA(cerr); 2590 cerr = cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4);CHKERRCUDA(cerr); 2591 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2592 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2593 &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, 
mmdata->dBuffer4);CHKERRCUSPARSE(stat); 2594 cerr = cudaFree(dBuffer1);CHKERRCUDA(cerr); 2595 cerr = cudaFree(dBuffer2);CHKERRCUDA(cerr); 2596 2597 /*----------------------------------------------------------------------*/ 2598 /* get matrix C non-zero entries C_nnz1 */ 2599 stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat); 2600 c->nz = (PetscInt) C_nnz1; 2601 /* allocate matrix C */ 2602 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2603 Ccsr->values = new THRUSTARRAY(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2604 /* update matC with the new pointers */ 2605 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2606 Ccsr->values->data().get());CHKERRCUSPARSE(stat); 2607 2608 /*----------------------------------------------------------------------*/ 2609 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2610 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2611 &bufferSize5, NULL);CHKERRCUSPARSE(stat); 2612 cerr = cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5);CHKERRCUDA(cerr); 2613 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2614 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2615 &bufferSize5, mmdata->dBuffer5);CHKERRCUSPARSE(stat); 2616 cerr = cudaFree(dBuffer3);CHKERRCUDA(cerr); 2617 stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, 2618 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2619 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2620 mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2621 ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024);CHKERRQ(ierr); 2622 } 
2623 #else // ~PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2624 size_t bufSize2; 2625 /* ask bufferSize bytes for external memory */ 2626 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 2627 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2628 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2629 mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat); 2630 cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr); 2631 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2632 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 2633 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2634 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2635 mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat); 2636 /* ask bufferSize again bytes for external memory */ 2637 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2638 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2639 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2640 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat); 2641 /* The CUSPARSE documentation is not clear, nor the API 2642 We need both buffers to perform the operations properly! 2643 mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 2644 it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 2645 is stored in the descriptor! What a messy API... 
*/ 2646 cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr); 2647 /* compute the intermediate product of A * B */ 2648 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2649 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2650 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2651 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2652 /* get matrix C non-zero entries C_nnz1 */ 2653 stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat); 2654 c->nz = (PetscInt) C_nnz1; 2655 ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr); 2656 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2657 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2658 Ccsr->values = new THRUSTARRAY(c->nz); 2659 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2660 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2661 Ccsr->values->data().get());CHKERRCUSPARSE(stat); 2662 stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, 2663 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2664 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2665 #endif 2666 #else 2667 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 2668 stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, 2669 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2670 Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2671 Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2672 Cmat->descr, Ccsr->row_offsets->data().get(), 
&cnz);CHKERRCUSPARSE(stat); 2673 c->nz = cnz; 2674 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2675 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2676 Ccsr->values = new THRUSTARRAY(c->nz); 2677 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2678 2679 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2680 /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 2681 I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 2682 D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */ 2683 stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, 2684 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2685 Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2686 Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2687 Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2688 #endif 2689 ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2690 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2691 finalizesym: 2692 c->singlemalloc = PETSC_FALSE; 2693 c->free_a = PETSC_TRUE; 2694 c->free_ij = PETSC_TRUE; 2695 ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 2696 ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 2697 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 2698 PetscInt *d_i = c->i; 2699 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 2700 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 2701 ii = *Ccsr->row_offsets; 2702 jj = *Ccsr->column_indices; 2703 if (ciscompressed) d_i = c->compressedrow.i; 2704 cerr = 
cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2705 cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2706 } else { 2707 PetscInt *d_i = c->i; 2708 if (ciscompressed) d_i = c->compressedrow.i; 2709 cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2710 cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2711 } 2712 if (ciscompressed) { /* need to expand host row offsets */ 2713 PetscInt r = 0; 2714 c->i[0] = 0; 2715 for (k = 0; k < c->compressedrow.nrows; k++) { 2716 const PetscInt next = c->compressedrow.rindex[k]; 2717 const PetscInt old = c->compressedrow.i[k]; 2718 for (; r < next; r++) c->i[r+1] = old; 2719 } 2720 for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows]; 2721 } 2722 ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 2723 ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 2724 ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 2725 c->maxnz = c->nz; 2726 c->nonzerorowcnt = 0; 2727 c->rmax = 0; 2728 for (k = 0; k < m; k++) { 2729 const PetscInt nn = c->i[k+1] - c->i[k]; 2730 c->ilen[k] = c->imax[k] = nn; 2731 c->nonzerorowcnt += (PetscInt)!!nn; 2732 c->rmax = PetscMax(c->rmax,nn); 2733 } 2734 ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr); 2735 ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 2736 Ccsr->num_entries = c->nz; 2737 2738 C->nonzerostate++; 2739 ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr); 2740 ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr); 2741 Ccusp->nonzerostate = C->nonzerostate; 2742 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 2743 C->preallocated = PETSC_TRUE; 2744 C->assembled = PETSC_FALSE; 2745 C->was_assembled = PETSC_FALSE; 
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/* handles sparse or dense B.
   Decides which productsymbolic implementation the product mat will use:
   - GPU sparse*dense when B is MATSEQDENSE and A is not bound to the CPU;
   - GPU sparse*sparse when both B and C are MATSEQAIJCUSPARSE, neither bound to CPU,
     and the user did not request the CPU backend via the -mat*_backend_cpu options;
   - otherwise fall back to the SeqAIJ (CPU) dispatch. */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscErrorCode ierr;
  PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr);
  /* a matrix bound to the CPU disqualifies the GPU backend */
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
  }
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr);
    }
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    /* the option name depends on the product type and on whether the product was
       requested through the legacy API (MatMatMult etc.) or the MatProduct API */
    switch (product->type) {
    case MATPRODUCT_AB:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_ab_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_AtB:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_atb_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_PtAP:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_ptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_RARt:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_rart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_ABC:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_abc_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    default:
      break;
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) {
        ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* yy = A*xx; thin wrapper over the shared mult/add kernel (no add, no transpose) */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* zz = A*xx + yy */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* yy = A^H*xx (conjugate transpose) */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* zz = A^H*xx + yy */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* yy = A^T*xx */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* y[idx[i]] += x[i] for i in [0,n); launched with a 1D grid, one thread per entry,
   bounds-guarded. Used to scatter the compressed-row SpMV result back into the full
   output vector. NOTE(review): no atomics are used, so this presumably relies on the
   idx entries (compressed row indices) being distinct -- confirm at call sites. */
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}

/* z = op(A) x + y.
   If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op.

   Shared worker behind all the SeqAIJCUSPARSE mult/add variants. yy may be NULL
   (plain mult) or equal to zz (in-place add). Handles compressed-row storage by
   routing the short result (or the short input, for transposed ops) through
   cusparsestruct->workVector, then scattering/gathering with the row index list. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny; /* lengths of x and y as seen by cusparseSpMV; only set for CSR format */
#endif

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
  /* empty matrix: result is just 0 or a copy of yy */
  if (!a->nonzerorowcnt) {
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    /* Hermitian ops, or matrices that do not keep an explicit transpose, reuse the
       untransposed data with a cuSPARSE transpose/conjugate-transpose operation flag */
    if (herm || !A->form_explicit_transpose) {
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
         allocated to accommodate different uses. So we get the length info directly from mat.
      */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* cuSPARSE >= 11 generic API: dense-vector descriptors and the SpMV work buffer
         are created once per operation (opA) and cached in matstruct->cuSpMV[opA] */
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        cudaError_t cerr;
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                       matstruct->matDescr,
                                       matstruct->cuSpMV[opA].vecXDescr, beta,
                                       matstruct->cuSpMV[opA].vecYDescr,
                                       cusparse_scalartype,
                                       cusparsestruct->spmvAlg,
                                       &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                          matstruct->alpha_one,
                          matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTransposeForMult() */
                          matstruct->cuSpMV[opA].vecXDescr,
                          beta,
                          matstruct->cuSpMV[opA].vecYDescr,
                          cusparse_scalartype,
                          cusparsestruct->spmvAlg,
                          matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
#else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
#endif
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
#if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
#else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
#endif
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* zz = A^T*xx + yy */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* AssemblyEnd: defer to SeqAIJ, then drop the cached device matrix if the
   nonzero pattern changed during assembly (it would be stale) */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode     ierr;
  PetscObjectState   onnz = A->nonzerostate;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    cudaError_t cerr;

    ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr);
    cerr = cudaFree(cusp->deviceMat);CHKERRCUDA(cerr);
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz).  By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.
nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Destroy: free the GPU-side data (plain storage or triangular factors, depending
   on whether this is a factored matrix), detach all composed methods, then defer
   to the SeqAIJ destroy for the host-side data */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);

/* Duplicate: duplicate as SeqAIJ on the host, then convert the copy in place */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Y = Y + a*X on the GPU. Falls back to the SeqAIJ (CPU) axpy when the two
   matrices do not share this implementation. Same nonzero pattern -> single
   cuBLAS axpy on the value arrays; subset pattern -> cuSPARSE spgeam; anything
   else -> CPU fallback. */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
  if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq =
thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    /* identical row offsets and column indices -> same pattern, so the cheap path applies */
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    /* Y = a*X + 1*Y via cuSPARSE spgeam, writing the result back into Y's arrays in place */
    cusparseStatus_t stat;
    PetscScalar      b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t           bufferSize;
    void             *buffer;
    cudaError_t      cerr;
#endif

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    /* the scalars a and b live on the host, so switch the pointer mode around the spgeam calls */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* CUDA >= 11 requires an explicitly sized and allocated work buffer */
    stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                          cy->mat->descr,    ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                               cy->mat->descr,    ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    cerr = cudaFree(buffer);CHKERRCUDA(cerr);
#else
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                               cy->mat->descr,    ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#endif
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else if (str == SAME_NONZERO_PATTERN) {
    /* same pattern: the value arrays line up entry-for-entry, so a dense axpy suffices */
    cublasHandle_t cublasv2handle;
    cublasStatus_t berr;
    PetscBLASInt   one = 1, bnz = 1;

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
    ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else {
    /* different pattern: CPU fallback, which may change Y's structure */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Y = a*Y: scale all stored values with one cuBLAS scal on the GPU value array */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *y = (Mat_SeqAIJ*)Y->data;
  PetscScalar    *ay;
  cublasHandle_t cublasv2handle;
  cublasStatus_t berr;
  PetscBLASInt   one = 1, bnz = 1;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(y->nz,&bnz);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  berr = cublasXscal(cublasv2handle,bnz,&a,ay,one);CHKERRCUBLAS(berr);
  ierr = PetscLogGpuFlops(bnz);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Zero all stored values. Zeros both the GPU copy (plain matrix and cached
   transpose, when present) and the host arrays, then sets the offload mask
   accordingly (BOTH when the GPU copy was zeroed too, else CPU). */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  PetscBool      both = PETSC_FALSE;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (spptr->mat) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE;
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
    if (spptr->matTranspose) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
      if (matrix->values) {
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
  }
  //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
  ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}

/* Switch the matrix between the CPU (flg = PETSC_TRUE) and GPU back ends by
   swapping the function tables and composed methods. Binding to the CPU first
   pulls the values back from the GPU so host operations see current data.
   Factored matrices are left untouched. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  if (flg) {
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  a->inode.use = flg; /* inode optimization only applies to the CPU kernels */
  PetscFunctionReturn(0);
}

/* Convert a SeqAIJ matrix to SeqAIJCUSPARSE: (possibly) duplicate/copy the host
   matrix, set the CUDA default vector type, create the cuSPARSE handle and the
   GPU-side bookkeeping (unless reusing an existing one), and install the
   CUSPARSE function tables */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat;

  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      /* put all cusparse work on PETSc's default stream so it orders correctly with other GPU work */
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      spptr->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
      spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
 #else
      spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
 #endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = spptr;
    } else {
      /* factored matrices carry a triangular-factors context instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* install the GPU operation table (flg = PETSC_FALSE means "not bound to CPU") */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type constructor: build a MATSEQAIJ matrix and convert it in place to MATSEQAIJCUSPARSE */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*MC
  MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

  A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
  CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
  All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

  Options Database Keys:
+ -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
. -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
- -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);

/* Register the cusparse solver package: LU/Cholesky/ILU/ICC factorizations for MATSEQAIJCUSPARSE,
   plus the banded LU variant that works directly on MATSEQAIJ */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}

/* Tear down the whole Mat_SeqAIJCUSPARSE context: both mult structures (matrix and its cached
   transpose), the thrust work arrays, the COO assembly metadata and the cusparse handle */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->csr2csc_i;
    if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
    ierr =
PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Free the three thrust device arrays backing a CsrMatrix and the struct itself; *mat is NULLed */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    delete (*mat)->values;
    delete (*mat)->column_indices;
    delete (*mat)->row_offsets;
    delete *mat;
    *mat = NULL;
  }
  PetscFunctionReturn(0);
}

/* Destroy one triangular-factor structure: the cusparse matrix descriptor and analysis info,
   the CSR storage, and the device/host scratch buffers */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr)     { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    if ((*trifactor)->solveBuffer)   {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
    if ((*trifactor)->AA_h)          {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
#endif
    ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Destroy a mult (SpMV/SpMM) structure.  For ELL/HYB the opaque hybrid matrix is destroyed
   (unsupported and an error with CUDA >= 11); for CSR the CsrMatrix is destroyed — the
   returned error code is now checked instead of being silently dropped.  The generic
   cusparse SpMV descriptors introduced with CUDA 11 are released as well. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
#endif
      } else {
        mat  = (CsrMatrix*)(*matstruct)->mat;
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr); /* previously the error code was ignored */
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}

/* Release everything owned by the triangular-factors context except the cusparse handle,
   leaving the struct reusable.  Freed device pointers are NULLed so a repeated Reset (e.g.
   Reset followed by Destroy, which calls Reset again) cannot double-free them. */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector   = NULL;
    if ((*trifactors)->a_band_d) {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);}
    if ((*trifactors)->i_band_d) {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);}
    (*trifactors)->a_band_d = NULL; /* previously left dangling -> double cudaFree on a second Reset */
    (*trifactors)->i_band_d = NULL;
    (*trifactors)->init_dev_prop = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}

/* Full destruction of the triangular-factors context: Reset plus the cusparse handle and the struct */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    if ((handle = (*trifactors)->handle)) { /* extra parentheses: assignment-in-condition is intentional */
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* lexicographic (row, col) ordering for COO entries */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    if (t1.get<0>() < t2.get<0>()) return true;
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
    return false;
  }
};

/* equality of (row, col) pairs, used to detect repeated COO entries */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
    return true;
  }
};

/* 0/1 "value changed" indicator for adjacent_difference over index sequences */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return t1 == t2 ?
0 : 1;
  }
};

/* logical OR used to merge the "row changed" and "column changed" indicator sequences */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return t1||t2;
  }
};

#include <thrust/iterator/discard_iterator.h>
/* Insert/add values given in COO order (matching the coo_i/coo_j passed to
   MatSetPreallocationCOO_SeqAIJCUSPARSE) directly into the GPU CSR values array.
   cooPerm maps sorted-CSR position -> original COO position; cooPerm_a (if present)
   maps sorted COO entries to unique nonzeros so repeated entries are reduced on the fly.
   v may live on host or device (detected with isCudaMem). v == NULL with INSERT_VALUES
   zeroes the matrix. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *cooPerm_v = NULL; /* device staging buffer when v is a host pointer */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *matrix;
  PetscErrorCode                        ierr;
  PetscInt                              n;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) { /* no COO preallocation info: fall back to a plain assembly */
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add them first */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
        cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonzeros in d_v[].
        cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    }
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU; /* values were written on the device only */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
  ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr);
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}

/* Mark the cached explicit transpose as out of date; with destroy == PETSC_TRUE the
   transpose mult structure and the csr2csc index map are freed outright */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (!cusp) PetscFunctionReturn(0);
  if (destroy) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
    delete cusp->csr2csc_i;
    cusp->csr2csc_i = NULL;
  }
  A->transupdated = PETSC_FALSE;
  PetscFunctionReturn(0);
}

#include <thrust/binary_search.h>
/* Build the CSR pattern of A on the GPU from n COO entries (coo_i[], coo_j[]):
   sorts the (i,j) pairs, detects duplicates, records the permutation (cooPerm)
   and the duplicate-reduction map (cooPerm_a) later used by MatSetValuesCOO,
   and fills in the host CSR arrays a->i/a->j. */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0; /* nzr counts rows that have at least one nonzero */
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  cooPerm_n = cusp->cooPerm ?
cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) { /* COO size changed: throw away the old permutation data */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    *cusp->cooPerm_a = d_i; /* copy the sorted array */
    THRUSTINTARRAY w = d_j;

    /*
      d_i = [1,1,3,3,4,4]
      d_j = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i = [1,3,3,4,4,x]
                        ^ekey
      d_j = [2,2,3,5,6,x]
                       ^nekey
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0] */
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff()); /* w: [2,2,2,3,5,6] => [2,0,0,1,1,1] */
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a = [0,0,1,1,1,1] */
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /* cooPerm_a=[0,0,1,2,3,4] */
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,          /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin());                                      /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* rebuild the host CSR arrays from the device-computed pattern */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);

  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}

/* Return device pointers to the CSR row offsets and column indices of A.
   With compressed == PETSC_FALSE and a compressed-row matrix, a full (uncompressed)
   row-offset array is built on the GPU on first use and cached in rowoffsets_gpu. */
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  if (!i || !j) PetscFunctionReturn(0);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) {
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
        ierr = PetscLogCpuToGpu((A->rmap->n +
1)*sizeof(PetscInt));CHKERRQ(ierr);
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(0);
}

/* Companion of MatSeqAIJCUSPARSEGetIJ: just clears the caller's pointers */
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (i) *i = NULL;
  if (j) *j = NULL;
  PetscFunctionReturn(0);
}

/* Read-only access to the device CSR values; ensures the GPU copy is up to date first */
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  PetscFunctionReturn(0);
}

/* Read-only restore: no state change needed since the values were not modified */
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  *a = NULL;
  PetscFunctionReturn(0);
}

/* Read-write access to the device CSR values: the GPU copy becomes authoritative
   (offloadmask = GPU) and the cached transpose is marked stale */
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Read-write restore: bump the object state so dependents notice the modification */
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL;
  PetscFunctionReturn(0);
}

/* Write-only access: like GetArray but skips the host-to-device copy since the
   current values will be overwritten anyway */
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Write-only restore: bump the object state, matching MatSeqAIJCUSPARSERestoreArray */
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL;
  PetscFunctionReturn(0);
}

/* lexicographic (row, col) ordering on (row, col, value, flag) tuples, used when merging
   two sorted COO streams in MatSeqAIJCUSPARSEMergeMats */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    if (t1.get<0>() < t2.get<0>()) return true;
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
    return false;
  }
};

/* add a constant offset to column indices (shifts B's columns past A's in the merge) */
struct Shift
{
  int _shift;

  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &c)
  {
    return c + _shift;
  }
};

/* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows.
[A';B']' operation in matlab notation */
/*
   MatSeqAIJCUSPARSEMergeMats - merges two MATSEQAIJCUSPARSE matrices side by side,
   C = [A,B], with the concatenation performed on the GPU.

   Input Parameters:
+  A     - left matrix
.  B     - right matrix; must have the same number of rows as A
-  reuse - MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX (MAT_INPLACE_MATRIX is not supported)

   Output Parameter:
.  C - the merged matrix

   Notes:
   Only the CSR storage format is supported (ELL/HYB error out). With
   MAT_REUSE_MATRIX, C must come from a previous MAT_INITIAL_MATRIX call with the
   same nonzero pattern; only the numerical values are then refreshed, using the
   permutation recorded in Ccusp->cooPerm. If both A and B carry an explicit
   transpose, the transpose of C ( = [A';B'] stacked ) is assembled as well.
*/
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     Annz,Bnnz;
  cusparseStatus_t             stat;
  PetscInt                     i,m,n,zero = 0;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",A->rmap->n,B->rmap->n);
  if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
  if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    m = A->rmap->n;
    n = A->cmap->n + B->cmap->n;
    ierr = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
    ierr = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
    ierr = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    c     = (Mat_SeqAIJ*)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
    Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr  = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE; /* C is stored with full rows */
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows    = m;
    Ccusp->mat      = Cmat;
    Ccusp->mat->mat = Ccsr;
    Ccsr->num_rows  = m;
    Ccsr->num_cols  = n;
    stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr);
    if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr  = (CsrMatrix*)Acusp->mat->mat;
    Bcsr  = (CsrMatrix*)Bcusp->mat->mat;
    Annz  = (PetscInt)Acsr->column_indices->size();
    Bnnz  = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets    = new THRUSTINTARRAY32(m+1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    Ccsr->num_entries    = c->nz;
    /* cooPerm records where each entry of A (first Annz slots) and B (remaining
       slots) lands inside C's value array; the MAT_REUSE_MATRIX path below uses
       it to scatter fresh values without redoing the symbolic merge */
    Ccusp->cooPerm       = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      auto Acoo = new THRUSTINTARRAY32(Annz);
      auto Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff,*Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
          ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
          ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      /* expand CSR row offsets to explicit COO row indices so A and B entries
         can be merged row by row */
      stat = cusparseXcsr2coo(Acusp->handle,
                              Aroff->data().get(),
                              Annz,
                              m,
                              Acoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle,
                              Broff->data().get(),
                              Bnnz,
                              m,
                              Bcoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      auto Aperm = thrust::make_constant_iterator(1); /* tags A's entries in the merged stream */
      auto Bperm = thrust::make_constant_iterator(0); /* tags B's entries */
#if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
      /* B's columns are shifted right by A's column count inside C */
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
      auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
      auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
      auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
      auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
      auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
      auto p1 = Ccusp->cooPerm->begin();
      auto p2 = Ccusp->cooPerm->begin();
      thrust::advance(p2,Annz);
      /* merge the two sorted COO streams; wPerm receives the A/B tag of each
         merged slot */
      PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
      thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n)); /* restore B's columns shifted above */
#endif
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      /* split the merged slot indices: tag==1 slots (A's entries) go to the
         front of cooPerm, tag==0 slots (B's entries) after position Annz */
      auto pred = thrust::identity<int>();
      PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
      PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
#endif
      stat = cusparseXcoo2csr(Ccusp->handle,
                              Ccoo->data().get(),
                              c->nz,
                              m,
                              Ccsr->row_offsets->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
                               Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix *CcsrT = new CsrMatrix;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated = PETSC_TRUE;
        Ccusp->rowoffsets_gpu = NULL;
        CmatT->cprowIndices = NULL;
        CmatT->mat = CcsrT;
        CcsrT->num_rows = n;
        CcsrT->num_cols = m;
        CcsrT->num_entries = c->nz;

        CcsrT->row_offsets    = new THRUSTINTARRAY32(n+1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values         = new THRUSTARRAY(c->nz);

        /* C^T is [A^T; B^T] stacked vertically: row offsets are A^T's followed
           by B^T's shifted by A's nnz (the two share the offset at row A->cmap->n,
           hence the advance(rT,-1) to overwrite the duplicate) */
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
          thrust::advance(rT,-1);
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
          thrust::copy(titb,tite,rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

        stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
                                 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
                                 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror the merged pattern to the host so C is a fully valid SeqAIJ matrix */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii = *Ccsr->row_offsets;
      jj = *Ccsr->column_indices;
      cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    } else {
      cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
    c->maxnz         = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax          = 0;
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i+1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax,nn);
    }
    ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
    (*C)->nonzerostate++;
    ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
    ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",(*C)->rmap->n,B->rmap->n);
    c = (Mat_SeqAIJ*)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
      if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
      if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
      if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
      if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix*)Acusp->mat->mat;
      Bcsr = (CsrMatrix*)Bcusp->mat->mat;
      Ccsr = (CsrMatrix*)Ccusp->mat->mat;
      if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
      if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
      if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
      if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
      if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
      /* scatter new values through cooPerm: [begin,pmid) maps A's values into C,
         [pmid,end) maps B's values */
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid,Acsr->num_entries);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      thrust::for_each(zibait,zieait,VecCUDAEquals());
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
      thrust::for_each(zibbit,ziebit,VecCUDAEquals());
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr);
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        /* C^T values are A^T's followed by B^T's (vertical stacking) */
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    }
  }
  ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJCopySubArray_SeqAIJCUSPARSE - copies selected entries of the matrix's
   device value array into v: v[k] = a[idx[k]] for k = 0..n-1, or the first n
   entries verbatim when idx is NULL. v may point to either host or device memory
   (detected with isCudaMem); the gather itself runs on the GPU.
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;
  const PetscScalar *av;
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    THRUSTARRAY *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      /* stage the gather in a device buffer, then copy it back to the host */
      w  = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) {
      cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    delete w;
  } else {
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  /* data moved device->host above, so log GPU-to-CPU traffic (was wrongly
     logged as CpuToGpu) */
  if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); }
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}