1 /* 2 Defines the basic matrix operations for the AIJ (compressed row) 3 matrix storage format using the CUSPARSE library, 4 */ 5 #define PETSC_SKIP_SPINLOCK 6 #define PETSC_SKIP_CXX_COMPLEX_FIX 7 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 8 9 #include <petscconf.h> 10 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 11 #include <../src/mat/impls/sbaij/seq/sbaij.h> 12 #include <../src/vec/vec/impls/dvecimpl.h> 13 #include <petsc/private/vecimpl.h> 14 #undef VecType 15 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 16 #include <thrust/async/for_each.h> 17 18 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 19 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 20 /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in 21 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them. 22 23 typedef enum { 24 CUSPARSE_MV_ALG_DEFAULT = 0, 25 CUSPARSE_COOMV_ALG = 1, 26 CUSPARSE_CSRMV_ALG1 = 2, 27 CUSPARSE_CSRMV_ALG2 = 3 28 } cusparseSpMVAlg_t; 29 30 typedef enum { 31 CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0, 32 CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1, 33 CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2, 34 CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3, 35 CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4, 36 CUSPARSE_SPMM_ALG_DEFAULT = 0, 37 CUSPARSE_SPMM_COO_ALG1 = 1, 38 CUSPARSE_SPMM_COO_ALG2 = 2, 39 CUSPARSE_SPMM_COO_ALG3 = 3, 40 CUSPARSE_SPMM_COO_ALG4 = 5, 41 CUSPARSE_SPMM_CSR_ALG1 = 4, 42 CUSPARSE_SPMM_CSR_ALG2 = 6, 43 } cusparseSpMMAlg_t; 44 45 typedef enum { 46 CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc 47 CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministc 48 } cusparseCsr2CscAlg_t; 49 */ 50 const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0}; 51 const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0}; 52 const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0}; 53 #endif 54 55 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 56 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 57 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 58 59 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 60 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 61 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 62 63 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 64 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 65 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 66 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 67 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat); 68 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure); 69 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar); 70 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 71 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 72 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 73 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 74 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 75 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 76 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool); 77 78 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**); 79 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**); 80 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat); 81 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**); 82 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**); 83 84 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat); 85 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat); 86 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool); 87 88 PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]); 89 PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode); 90 91 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]); 92 93 PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream) 94 { 95 cusparseStatus_t stat; 96 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 97 98 PetscFunctionBegin; 99 if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 100 cusparsestruct->stream = stream; 101 stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat); 102 PetscFunctionReturn(0); 103 } 104 105 PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle) 106 { 107 cusparseStatus_t stat; 108 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 109 110 PetscFunctionBegin; 111 if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 112 if (cusparsestruct->handle != handle) { 113 if (cusparsestruct->handle) { 114 stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat); 115 } 116 cusparsestruct->handle = handle; 117 } 118 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 119 PetscFunctionReturn(0); 120 } 121 122 PetscErrorCode MatCUSPARSEClearHandle(Mat A) 123 { 124 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 125 PetscBool flg; 126 PetscErrorCode ierr; 127 128 PetscFunctionBegin; 129 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 130 if (!flg || !cusparsestruct) PetscFunctionReturn(0); 131 if (cusparsestruct->handle) cusparsestruct->handle = 0; 132 PetscFunctionReturn(0); 133 } 134 135 PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type) 136 { 137 PetscFunctionBegin; 138 *type = MATSOLVERCUSPARSE; 139 PetscFunctionReturn(0); 140 } 141 142 /*MC 143 MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices 144 on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported 145 algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 146 performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 147 CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 148 algorithms are not recommended. This class does NOT support direct solver operations. 149 150 Level: beginner 151 152 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 153 M*/ 154 155 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B) 156 { 157 PetscErrorCode ierr; 158 PetscInt n = A->rmap->n; 159 160 PetscFunctionBegin; 161 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 162 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 163 (*B)->factortype = ftype; 164 ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 165 166 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 167 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 168 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 169 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 170 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 171 ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr); 172 ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr); 173 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 174 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 175 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 176 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr); 177 ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr); 178 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 179 180 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 181 (*B)->canuseordering = PETSC_TRUE; 182 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr); 183 PetscFunctionReturn(0); 184 } 185 186 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 187 { 188 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 189 190 PetscFunctionBegin; 191 switch (op) { 192 case MAT_CUSPARSE_MULT: 193 cusparsestruct->format = format; 194 break; 195 case MAT_CUSPARSE_ALL: 196 cusparsestruct->format = format; 197 break; 198 default: 199 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op); 200 } 201 PetscFunctionReturn(0); 202 } 203 204 /*@ 205 MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 206 operation. Only the MatMult operation can use different GPU storage formats 207 for MPIAIJCUSPARSE matrices. 208 Not Collective 209 210 Input Parameters: 211 + A - Matrix of type SEQAIJCUSPARSE 212 . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 213 - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2) 214 215 Output Parameter: 216 217 Level: intermediate 218 219 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 220 @*/ 221 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 222 { 223 PetscErrorCode ierr; 224 225 PetscFunctionBegin; 226 PetscValidHeaderSpecific(A, MAT_CLASSID,1); 227 ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr); 228 PetscFunctionReturn(0); 229 } 230 231 PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg) 232 { 233 PetscErrorCode ierr; 234 235 PetscFunctionBegin; 236 switch (op) { 237 case MAT_FORM_EXPLICIT_TRANSPOSE: 238 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 239 if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);} 240 A->form_explicit_transpose = flg; 241 break; 242 default: 243 ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr); 244 break; 245 } 246 PetscFunctionReturn(0); 247 } 248 249 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A); 250 251 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 252 { 253 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 254 IS isrow = b->row,iscol = b->col; 255 PetscBool row_identity,col_identity; 256 PetscErrorCode ierr; 257 258 PetscFunctionBegin; 259 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 260 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 261 B->offloadmask = PETSC_OFFLOAD_CPU; 262 /* determine which version of MatSolve needs to be used. */ 263 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 264 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 265 if (row_identity && col_identity) { 266 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 267 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 268 B->ops->matsolve = NULL; 269 B->ops->matsolvetranspose = NULL; 270 } else { 271 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 272 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 273 B->ops->matsolve = NULL; 274 B->ops->matsolvetranspose = NULL; 275 } 276 277 /* get the triangular factors */ 278 ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 279 PetscFunctionReturn(0); 280 } 281 282 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 283 { 284 PetscErrorCode ierr; 285 MatCUSPARSEStorageFormat format; 286 PetscBool flg; 287 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 288 289 PetscFunctionBegin; 290 ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr); 291 if (A->factortype == MAT_FACTOR_NONE) { 292 ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV", 293 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 294 if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);} 295 296 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 297 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 298 if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);} 299 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 300 ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", 301 "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr); 302 /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */ 303 if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 304 305 ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", 306 "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr); 307 if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly"); 308 309 ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", 310 "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr); 311 if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly"); 312 #endif 313 } 314 ierr = PetscOptionsTail();CHKERRQ(ierr); 315 PetscFunctionReturn(0); 316 } 317 318 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 319 { 320 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 321 PetscErrorCode ierr; 322 323 PetscFunctionBegin; 324 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 325 ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 326 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 327 PetscFunctionReturn(0); 328 } 329 330 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 331 { 332 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 333 PetscErrorCode ierr; 334 335 PetscFunctionBegin; 336 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 337 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 338 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 339 PetscFunctionReturn(0); 340 } 341 342 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 343 { 344 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 345 PetscErrorCode ierr; 346 347 PetscFunctionBegin; 348 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 349 ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 350 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 351 PetscFunctionReturn(0); 352 } 353 354 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 355 { 356 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 357 PetscErrorCode ierr; 358 359 PetscFunctionBegin; 360 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 361 ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 362 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 363 PetscFunctionReturn(0); 364 } 365 366 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 367 { 368 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 369 PetscInt n = A->rmap->n; 370 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 371 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 372 cusparseStatus_t stat; 373 const PetscInt *ai = a->i,*aj = a->j,*vi; 374 const MatScalar *aa = a->a,*v; 375 PetscInt *AiLo, *AjLo; 376 PetscInt i,nz, nzLower, offset, rowOffset; 377 PetscErrorCode ierr; 378 cudaError_t cerr; 379 380 PetscFunctionBegin; 381 if (!n) PetscFunctionReturn(0); 382 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 383 try { 384 /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */ 385 nzLower=n+ai[n]-ai[1]; 386 if (!loTriFactor) { 387 PetscScalar *AALo; 388 389 cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr); 390 391 /* Allocate Space for the lower triangular matrix */ 392 cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 393 cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr); 394 395 /* Fill the lower triangular matrix */ 396 AiLo[0] = (PetscInt) 0; 397 AiLo[n] = nzLower; 398 AjLo[0] = (PetscInt) 0; 399 AALo[0] = (MatScalar) 1.0; 400 v = aa; 401 vi = aj; 402 offset = 1; 403 rowOffset= 1; 404 for (i=1; i<n; i++) { 405 nz = ai[i+1] - ai[i]; 406 /* additional 1 for the term on the diagonal */ 407 AiLo[i] = rowOffset; 408 rowOffset += nz+1; 409 410 ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr); 411 ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr); 412 413 offset += nz; 414 AjLo[offset] = (PetscInt) i; 415 AALo[offset] = (MatScalar) 1.0; 416 offset += 1; 417 418 v += nz; 419 vi += nz; 420 } 421 422 /* allocate space for the triangular factor information */ 423 ierr = PetscNew(&loTriFactor);CHKERRQ(ierr); 424 loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 425 /* Create the matrix description */ 426 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 427 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 428 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 429 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 430 #else 431 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 432 #endif 433 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat); 434 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 435 436 /* set the operation */ 437 loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 438 439 /* set the matrix */ 440 loTriFactor->csrMat = new CsrMatrix; 441 loTriFactor->csrMat->num_rows = n; 442 loTriFactor->csrMat->num_cols = n; 443 loTriFactor->csrMat->num_entries = nzLower; 444 445 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 446 loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1); 447 448 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 449 loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower); 450 451 loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 452 loTriFactor->csrMat->values->assign(AALo, AALo+nzLower); 453 454 /* Create the solve analysis information */ 455 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 456 stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 457 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 458 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 459 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 460 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 461 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 462 &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 463 cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 464 #endif 465 466 /* perform the solve analysis */ 467 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 468 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 469 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 470 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo 471 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 472 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 473 #endif 474 );CHKERRCUSPARSE(stat); 475 cerr = WaitForCUDA();CHKERRCUDA(cerr); 476 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 477 478 /* assign the pointer */ 479 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 480 loTriFactor->AA_h = AALo; 481 cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr); 482 cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr); 483 ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 484 } else { /* update values only */ 485 if (!loTriFactor->AA_h) { 486 cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr); 487 } 488 /* Fill the lower triangular matrix */ 489 loTriFactor->AA_h[0] = 1.0; 490 v = aa; 491 vi = aj; 492 offset = 1; 493 for (i=1; i<n; i++) { 494 nz = ai[i+1] - ai[i]; 495 ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr); 496 offset += nz; 497 loTriFactor->AA_h[offset] = 1.0; 498 offset += 1; 499 v += nz; 500 } 501 loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower); 502 ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 503 } 504 } catch(char *ex) { 505 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 506 } 507 } 508 PetscFunctionReturn(0); 509 } 510 511 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 512 { 513 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 514 PetscInt n = A->rmap->n; 515 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 516 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 517 cusparseStatus_t stat; 518 const PetscInt *aj = a->j,*adiag = a->diag,*vi; 519 const MatScalar *aa = a->a,*v; 520 PetscInt *AiUp, *AjUp; 521 PetscInt i,nz, nzUpper, offset; 522 PetscErrorCode ierr; 523 cudaError_t cerr; 524 525 PetscFunctionBegin; 526 if (!n) PetscFunctionReturn(0); 527 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 528 try { 529 /* next, figure out the number of nonzeros in the upper triangular matrix. */ 530 nzUpper = adiag[0]-adiag[n]; 531 if (!upTriFactor) { 532 PetscScalar *AAUp; 533 534 cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 535 536 /* Allocate Space for the upper triangular matrix */ 537 cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 538 cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 539 540 /* Fill the upper triangular matrix */ 541 AiUp[0]=(PetscInt) 0; 542 AiUp[n]=nzUpper; 543 offset = nzUpper; 544 for (i=n-1; i>=0; i--) { 545 v = aa + adiag[i+1] + 1; 546 vi = aj + adiag[i+1] + 1; 547 548 /* number of elements NOT on the diagonal */ 549 nz = adiag[i] - adiag[i+1]-1; 550 551 /* decrement the offset */ 552 offset -= (nz+1); 553 554 /* first, set the diagonal elements */ 555 AjUp[offset] = (PetscInt) i; 556 AAUp[offset] = (MatScalar)1./v[nz]; 557 AiUp[i] = AiUp[i+1] - (nz+1); 558 559 ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr); 560 ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr); 561 } 562 563 /* allocate space for the triangular factor information */ 564 ierr = PetscNew(&upTriFactor);CHKERRQ(ierr); 565 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 566 567 /* Create the matrix description */ 568 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 569 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 570 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 571 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 572 #else 573 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 574 #endif 575 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 576 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 577 578 /* set the operation */ 579 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 580 581 /* set the matrix */ 582 upTriFactor->csrMat = new CsrMatrix; 583 upTriFactor->csrMat->num_rows = n; 584 upTriFactor->csrMat->num_cols = n; 585 upTriFactor->csrMat->num_entries = nzUpper; 586 587 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 588 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 589 590 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 591 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 592 593 upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 594 upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 595 596 /* Create the solve analysis information */ 597 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 598 stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 599 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 600 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 601 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 602 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 603 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 604 &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 605 cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 606 #endif 607 608 /* perform the solve analysis */ 609 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 610 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 611 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 612 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo 613 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 614 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 615 #endif 616 );CHKERRCUSPARSE(stat); 617 cerr = WaitForCUDA();CHKERRCUDA(cerr); 618 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 619 620 /* assign the pointer */ 621 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 622 upTriFactor->AA_h = AAUp; 623 cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 624 cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 625 ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 626 } else { 627 if (!upTriFactor->AA_h) { 628 cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 629 } 630 /* Fill the upper triangular matrix */ 631 offset = nzUpper; 632 for (i=n-1; i>=0; i--) { 633 v = aa + adiag[i+1] + 1; 634 635 /* number of elements NOT on the diagonal */ 636 nz = adiag[i] - adiag[i+1]-1; 637 638 /* decrement the offset */ 639 offset -= (nz+1); 640 641 /* first, set the diagonal elements */ 642 upTriFactor->AA_h[offset] = 1./v[nz]; 643 ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr); 644 } 645 upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper); 646 ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 647 } 648 } catch(char *ex) { 649 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 650 } 651 } 652 PetscFunctionReturn(0); 653 } 654 655 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 656 { 657 PetscErrorCode ierr; 658 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 659 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 660 IS isrow = a->row,iscol = a->icol; 661 PetscBool row_identity,col_identity; 662 PetscInt n = A->rmap->n; 663 664 PetscFunctionBegin; 665 if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 666 ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr); 667 ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr); 668 669 if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 670 cusparseTriFactors->nnz=a->nz; 671 672 A->offloadmask = PETSC_OFFLOAD_BOTH; 673 /* lower triangular indices */ 674 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 675 if (!row_identity && !cusparseTriFactors->rpermIndices) { 676 const PetscInt *r; 677 678 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 679 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 680 cusparseTriFactors->rpermIndices->assign(r, r+n); 681 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 682 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 683 } 684 685 /* upper triangular indices */ 686 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 687 if (!col_identity && !cusparseTriFactors->cpermIndices) { 688 const PetscInt *c; 689 690 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 691 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 692 cusparseTriFactors->cpermIndices->assign(c, c+n); 693 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 694 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 695 } 696 PetscFunctionReturn(0); 697 } 698 699 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 700 { 701 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 702 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 703 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 704 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 705 cusparseStatus_t stat; 706 PetscErrorCode ierr; 707 cudaError_t cerr; 708 PetscInt *AiUp, *AjUp; 709 PetscScalar *AAUp; 710 PetscScalar *AALo; 711 PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 712 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 713 const PetscInt *ai = b->i,*aj = b->j,*vj; 714 const MatScalar *aa = b->a,*v; 715 716 PetscFunctionBegin; 717 if (!n) PetscFunctionReturn(0); 718 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 719 try { 720 cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 721 cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 722 if (!upTriFactor && !loTriFactor) { 723 /* Allocate Space for the upper triangular matrix */ 724 cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 725 cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 726 727 /* Fill the upper triangular matrix */ 728 AiUp[0]=(PetscInt) 0; 729 AiUp[n]=nzUpper; 730 offset = 0; 731 for (i=0; i<n; i++) { 732 /* set the pointers */ 733 v = aa + ai[i]; 734 vj = aj + ai[i]; 735 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 736 737 /* first, set the diagonal elements */ 738 AjUp[offset] = (PetscInt) i; 739 AAUp[offset] = (MatScalar)1.0/v[nz]; 740 AiUp[i] = offset; 741 AALo[offset] = (MatScalar)1.0/v[nz]; 742 743 offset+=1; 744 if (nz>0) { 745 ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr); 746 ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 747 for (j=offset; j<offset+nz; j++) { 748 AAUp[j] = -AAUp[j]; 749 AALo[j] = AAUp[j]/v[nz]; 750 } 751 offset+=nz; 752 } 753 } 754 755 /* allocate space for the triangular factor information */ 756 ierr = PetscNew(&upTriFactor);CHKERRQ(ierr); 757 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 758 759 /* Create the matrix description */ 760 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 761 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 762 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 763 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 764 #else 765 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 766 #endif 767 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 768 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 769 770 /* set the matrix */ 771 upTriFactor->csrMat = new CsrMatrix; 772 upTriFactor->csrMat->num_rows = A->rmap->n; 773 upTriFactor->csrMat->num_cols = A->cmap->n; 774 upTriFactor->csrMat->num_entries = a->nz; 775 776 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 777 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 778 779 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 780 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 781 782 upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 783 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 784 785 /* set the operation */ 786 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 787 788 /* Create the solve analysis information */ 789 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 790 stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 791 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 792 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 793 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 794 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 795 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 796 &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 797 cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 798 #endif 799 800 /* perform the solve analysis */ 801 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 802 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 803 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 804 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo 805 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 806 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 807 #endif 808 );CHKERRCUSPARSE(stat); 809 cerr = WaitForCUDA();CHKERRCUDA(cerr); 810 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 811 812 /* assign the pointer */ 813 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 814 815 /* allocate space for the triangular factor information */ 816 ierr = PetscNew(&loTriFactor);CHKERRQ(ierr); 817 loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 818 819 /* Create the matrix description */ 820 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 821 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 822 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 823 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 824 #else 825 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 826 #endif 827 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 828 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 829 830 /* set the operation */ 831 loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 832 833 /* set the matrix */ 834 loTriFactor->csrMat = new CsrMatrix; 835 loTriFactor->csrMat->num_rows = A->rmap->n; 836 loTriFactor->csrMat->num_cols = A->cmap->n; 837 loTriFactor->csrMat->num_entries = a->nz; 838 839 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 840 loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 841 842 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 843 loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 844 845 loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 846 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 847 848 /* Create the solve analysis information */ 849 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 850 stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 851 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 852 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 853 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 854 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 855 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 856 &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 857 cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 858 #endif 859 860 /* perform the solve analysis */ 861 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 862 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 863 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 864 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo 865 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 866 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 867 #endif 868 );CHKERRCUSPARSE(stat); 869 cerr = WaitForCUDA();CHKERRCUDA(cerr); 870 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 871 872 /* assign the pointer */ 873 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 874 875 ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr); 876 cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 877 cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 878 } else { 879 /* Fill the upper triangular matrix */ 880 offset = 0; 881 for (i=0; i<n; i++) { 882 /* set the pointers */ 883 v = aa + ai[i]; 884 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 885 886 /* first, set the diagonal elements */ 887 AAUp[offset] = 1.0/v[nz]; 888 AALo[offset] = 1.0/v[nz]; 889 890 offset+=1; 891 if (nz>0) { 892 ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 893 for (j=offset; j<offset+nz; j++) { 894 AAUp[j] = -AAUp[j]; 895 AALo[j] = AAUp[j]/v[nz]; 896 } 897 offset+=nz; 898 } 899 } 900 if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 901 if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 902 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 903 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 904 ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 905 } 906 cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr); 907 cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr); 908 } catch(char *ex) { 909 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 910 } 911 } 912 PetscFunctionReturn(0); 913 } 914 915 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 916 { 917 PetscErrorCode ierr; 918 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 919 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 920 IS ip = a->row; 921 PetscBool perm_identity; 922 PetscInt n = A->rmap->n; 923 924 PetscFunctionBegin; 925 if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 926 ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr); 927 if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 928 cusparseTriFactors->nnz=(a->nz-n)*2 + n; 929 930 A->offloadmask = PETSC_OFFLOAD_BOTH; 931 932 /* lower triangular indices */ 933 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 934 if (!perm_identity) { 935 IS iip; 936 const PetscInt *irip,*rip; 937 938 ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr); 939 ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr); 940 ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); 941 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 942 cusparseTriFactors->rpermIndices->assign(rip, rip+n); 943 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 944 cusparseTriFactors->cpermIndices->assign(irip, irip+n); 945 ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr); 946 ierr = ISDestroy(&iip);CHKERRQ(ierr); 947 ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 948 ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 949 } 950 PetscFunctionReturn(0); 951 } 952 953 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 954 { 955 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 956 IS ip = b->row; 957 PetscBool perm_identity; 958 PetscErrorCode ierr; 959 960 PetscFunctionBegin; 961 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 962 ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 963 B->offloadmask = PETSC_OFFLOAD_CPU; 964 /* determine which version of MatSolve needs to be used. */ 965 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 966 if (perm_identity) { 967 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 968 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 969 B->ops->matsolve = NULL; 970 B->ops->matsolvetranspose = NULL; 971 } else { 972 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 973 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 974 B->ops->matsolve = NULL; 975 B->ops->matsolvetranspose = NULL; 976 } 977 978 /* get the triangular factors */ 979 ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 980 PetscFunctionReturn(0); 981 } 982 983 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 984 { 985 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 986 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 987 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 988 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; 989 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; 990 cusparseStatus_t stat; 991 cusparseIndexBase_t indexBase; 992 cusparseMatrixType_t matrixType; 993 cusparseFillMode_t fillMode; 994 cusparseDiagType_t diagType; 995 cudaError_t cerr; 996 PetscErrorCode ierr; 997 998 PetscFunctionBegin; 999 /* allocate space for the transpose of the lower triangular factor */ 1000 ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr); 1001 loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1002 1003 /* set the matrix descriptors of the lower triangular factor */ 1004 matrixType = cusparseGetMatType(loTriFactor->descr); 1005 indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 1006 fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1007 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1008 diagType = cusparseGetMatDiagType(loTriFactor->descr); 1009 1010 /* Create the matrix description */ 1011 stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat); 1012 stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 1013 stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 1014 stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 1015 stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1016 1017 /* set the operation */ 1018 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1019 1020 /* allocate GPU space for the CSC of the lower triangular factor*/ 1021 loTriFactorT->csrMat = new CsrMatrix; 1022 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 1023 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 1024 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 1025 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1); 1026 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 1027 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 1028 1029 /* compute the transpose of the lower triangular factor, i.e. the CSC */ 1030 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1031 stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1032 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1033 loTriFactor->csrMat->values->data().get(), 1034 loTriFactor->csrMat->row_offsets->data().get(), 1035 loTriFactor->csrMat->column_indices->data().get(), 1036 loTriFactorT->csrMat->values->data().get(), 1037 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1038 CUSPARSE_ACTION_NUMERIC,indexBase, 1039 CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1040 cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1041 #endif 1042 1043 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1044 stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1045 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1046 loTriFactor->csrMat->values->data().get(), 1047 loTriFactor->csrMat->row_offsets->data().get(), 1048 loTriFactor->csrMat->column_indices->data().get(), 1049 loTriFactorT->csrMat->values->data().get(), 1050 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1051 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1052 CUSPARSE_ACTION_NUMERIC, indexBase, 1053 CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer 1054 #else 1055 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1056 CUSPARSE_ACTION_NUMERIC, indexBase 1057 #endif 1058 );CHKERRCUSPARSE(stat); 1059 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1060 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1061 1062 /* Create the solve analysis information */ 1063 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1064 stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1065 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1066 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, 1067 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1068 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1069 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, 1070 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1071 cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1072 #endif 1073 1074 /* perform the solve analysis */ 1075 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 1076 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1077 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1078 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo 1079 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1080 ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1081 #endif 1082 );CHKERRCUSPARSE(stat); 1083 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1084 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1085 1086 /* assign the pointer */ 1087 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 1088 1089 /*********************************************/ 1090 /* Now the Transpose of the Upper Tri Factor */ 1091 /*********************************************/ 1092 1093 /* allocate space for the transpose of the upper triangular factor */ 1094 ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr); 1095 upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1096 1097 /* set the matrix descriptors of the upper triangular factor */ 1098 matrixType = cusparseGetMatType(upTriFactor->descr); 1099 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 1100 fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1101 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1102 diagType = cusparseGetMatDiagType(upTriFactor->descr); 1103 1104 /* Create the matrix description */ 1105 stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat); 1106 stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 1107 stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 1108 stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 1109 stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1110 1111 /* set the operation */ 1112 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1113 1114 /* allocate GPU space for the CSC of the upper triangular factor*/ 1115 upTriFactorT->csrMat = new CsrMatrix; 1116 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1117 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1118 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1119 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1); 1120 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1121 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1122 1123 /* compute the transpose of the upper triangular factor, i.e. the CSC */ 1124 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1125 stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows, 1126 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1127 upTriFactor->csrMat->values->data().get(), 1128 upTriFactor->csrMat->row_offsets->data().get(), 1129 upTriFactor->csrMat->column_indices->data().get(), 1130 upTriFactorT->csrMat->values->data().get(), 1131 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1132 CUSPARSE_ACTION_NUMERIC,indexBase, 1133 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1134 cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1135 #endif 1136 1137 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1138 stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 1139 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1140 upTriFactor->csrMat->values->data().get(), 1141 upTriFactor->csrMat->row_offsets->data().get(), 1142 upTriFactor->csrMat->column_indices->data().get(), 1143 upTriFactorT->csrMat->values->data().get(), 1144 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1145 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1146 CUSPARSE_ACTION_NUMERIC, indexBase, 1147 CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer 1148 #else 1149 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1150 CUSPARSE_ACTION_NUMERIC, indexBase 1151 #endif 1152 );CHKERRCUSPARSE(stat); 1153 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1154 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1155 1156 /* Create the solve analysis information */ 1157 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1158 stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1159 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1160 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, 1161 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1162 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1163 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, 1164 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1165 cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1166 #endif 1167 1168 /* perform the solve analysis */ 1169 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 1170 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1171 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1172 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo 1173 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1174 ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1175 #endif 1176 );CHKERRCUSPARSE(stat); 1177 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1178 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1179 1180 /* assign the pointer */ 1181 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 1182 PetscFunctionReturn(0); 1183 } 1184 1185 struct PetscScalarToPetscInt 1186 { 1187 __host__ __device__ 1188 PetscInt operator()(PetscScalar s) 1189 { 1190 return (PetscInt)PetscRealPart(s); 1191 } 1192 }; 1193 1194 static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTransposeForMult(Mat A) 1195 { 1196 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1197 Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT; 1198 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1199 cusparseStatus_t stat; 1200 cusparseIndexBase_t indexBase; 1201 cudaError_t err; 1202 PetscErrorCode ierr; 1203 1204 PetscFunctionBegin; 1205 if (!A->form_explicit_transpose || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0); 1206 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1207 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1208 if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct"); 1209 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1210 if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct"); 1211 if (A->transupdated) PetscFunctionReturn(0); 1212 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1213 if (cusparsestruct->format != MAT_CUSPARSE_CSR) { 1214 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 1215 } 1216 if (!cusparsestruct->matTranspose) { /* create cusparse matrix */ 1217 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 1218 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat); 1219 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1220 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat); 1221 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1222 1223 /* set alpha and beta */ 1224 err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 1225 err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1226 err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1227 err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1228 err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1229 err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1230 1231 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1232 CsrMatrix *matrixT = new CsrMatrix; 1233 matstructT->mat = matrixT; 1234 matrixT->num_rows = A->cmap->n; 1235 matrixT->num_cols = A->rmap->n; 1236 matrixT->num_entries = a->nz; 1237 matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 1238 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1239 matrixT->values = new THRUSTARRAY(a->nz); 1240 1241 if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); } 1242 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 1243 1244 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1245 stat = cusparseCreateCsr(&matstructT->matDescr, 1246 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1247 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1248 matrixT->values->data().get(), 1249 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1250 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 1251 #endif 1252 } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 1253 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1254 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1255 #else 1256 CsrMatrix *temp = new CsrMatrix; 1257 CsrMatrix *tempT = new CsrMatrix; 1258 /* First convert HYB to CSR */ 1259 temp->num_rows = A->rmap->n; 1260 temp->num_cols = A->cmap->n; 1261 temp->num_entries = a->nz; 1262 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1263 temp->column_indices = new THRUSTINTARRAY32(a->nz); 1264 temp->values = new THRUSTARRAY(a->nz); 1265 1266 stat = cusparse_hyb2csr(cusparsestruct->handle, 1267 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 1268 temp->values->data().get(), 1269 temp->row_offsets->data().get(), 1270 temp->column_indices->data().get());CHKERRCUSPARSE(stat); 1271 1272 /* Next, convert CSR to CSC (i.e. the matrix transpose) */ 1273 tempT->num_rows = A->rmap->n; 1274 tempT->num_cols = A->cmap->n; 1275 tempT->num_entries = a->nz; 1276 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1277 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1278 tempT->values = new THRUSTARRAY(a->nz); 1279 1280 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1281 temp->num_cols, temp->num_entries, 1282 temp->values->data().get(), 1283 temp->row_offsets->data().get(), 1284 temp->column_indices->data().get(), 1285 tempT->values->data().get(), 1286 tempT->column_indices->data().get(), 1287 tempT->row_offsets->data().get(), 1288 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1289 1290 /* Last, convert CSC to HYB */ 1291 cusparseHybMat_t hybMat; 1292 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1293 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1294 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1295 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1296 matstructT->descr, tempT->values->data().get(), 1297 tempT->row_offsets->data().get(), 1298 tempT->column_indices->data().get(), 1299 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1300 1301 /* assign the pointer */ 1302 matstructT->mat = hybMat; 1303 A->transupdated = PETSC_TRUE; 1304 /* delete temporaries */ 1305 if (tempT) { 1306 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1307 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1308 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1309 delete (CsrMatrix*) tempT; 1310 } 1311 if (temp) { 1312 if (temp->values) delete (THRUSTARRAY*) temp->values; 1313 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1314 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1315 delete (CsrMatrix*) temp; 1316 } 1317 #endif 1318 } 1319 } 1320 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */ 1321 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1322 CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat; 1323 if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix"); 1324 if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows"); 1325 if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols"); 1326 if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values"); 1327 if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT"); 1328 if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows"); 1329 if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols"); 1330 if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values"); 1331 if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */ 1332 cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1333 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 1334 ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 1335 } 1336 if (!cusparsestruct->csr2csc_i) { 1337 THRUSTARRAY csr2csc_a(matrix->num_entries); 1338 PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0)); 1339 1340 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1341 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1342 void *csr2cscBuffer; 1343 size_t csr2cscBufferSize; 1344 stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, 1345 A->cmap->n, matrix->num_entries, 1346 matrix->values->data().get(), 1347 cusparsestruct->rowoffsets_gpu->data().get(), 1348 matrix->column_indices->data().get(), 1349 matrixT->values->data().get(), 1350 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1351 CUSPARSE_ACTION_NUMERIC,indexBase, 1352 cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat); 1353 err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err); 1354 #endif 1355 1356 if (matrix->num_entries) { 1357 /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in 1358 mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK. 1359 I checked every parameters and they were just fine. I have no clue why cusparse complains. 1360 1361 Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[] 1362 should be filled with indexBase. So I just take a shortcut here. 1363 */ 1364 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1365 A->cmap->n,matrix->num_entries, 1366 csr2csc_a.data().get(), 1367 cusparsestruct->rowoffsets_gpu->data().get(), 1368 matrix->column_indices->data().get(), 1369 matrixT->values->data().get(), 1370 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1371 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1372 CUSPARSE_ACTION_NUMERIC,indexBase, 1373 cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat); 1374 #else 1375 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1376 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1377 #endif 1378 } else { 1379 matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 1380 } 1381 1382 cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); 1383 PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt())); 1384 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1385 err = cudaFree(csr2cscBuffer);CHKERRCUDA(err); 1386 #endif 1387 } 1388 PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), 1389 thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), 1390 matrixT->values->begin())); 1391 } 1392 ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1393 /* the compressed row indices is not used for matTranspose */ 1394 matstructT->cprowIndices = NULL; 1395 /* assign the pointer */ 1396 ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 1397 A->transupdated = PETSC_TRUE; 1398 PetscFunctionReturn(0); 1399 } 1400 1401 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 1402 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1403 { 1404 PetscInt n = xx->map->n; 1405 const PetscScalar *barray; 1406 PetscScalar *xarray; 1407 thrust::device_ptr<const PetscScalar> bGPU; 1408 thrust::device_ptr<PetscScalar> xGPU; 1409 cusparseStatus_t stat; 1410 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1411 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1412 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1413 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1414 PetscErrorCode ierr; 1415 cudaError_t cerr; 1416 1417 PetscFunctionBegin; 1418 /* Analyze the matrix and create the transpose ... on the fly */ 1419 if (!loTriFactorT && !upTriFactorT) { 1420 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1421 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1422 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1423 } 1424 1425 /* Get the GPU pointers */ 1426 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1427 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1428 xGPU = thrust::device_pointer_cast(xarray); 1429 bGPU = thrust::device_pointer_cast(barray); 1430 1431 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1432 /* First, reorder with the row permutation */ 1433 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1434 thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 1435 xGPU); 1436 1437 /* First, solve U */ 1438 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1439 upTriFactorT->csrMat->num_rows, 1440 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1441 upTriFactorT->csrMat->num_entries, 1442 #endif 1443 &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1444 upTriFactorT->csrMat->values->data().get(), 1445 upTriFactorT->csrMat->row_offsets->data().get(), 1446 upTriFactorT->csrMat->column_indices->data().get(), 1447 upTriFactorT->solveInfo, 1448 xarray, tempGPU->data().get() 1449 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1450 ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1451 #endif 1452 );CHKERRCUSPARSE(stat); 1453 1454 /* Then, solve L */ 1455 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1456 loTriFactorT->csrMat->num_rows, 1457 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1458 loTriFactorT->csrMat->num_entries, 1459 #endif 1460 &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1461 loTriFactorT->csrMat->values->data().get(), 1462 loTriFactorT->csrMat->row_offsets->data().get(), 1463 loTriFactorT->csrMat->column_indices->data().get(), 1464 loTriFactorT->solveInfo, 1465 tempGPU->data().get(), xarray 1466 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1467 ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1468 #endif 1469 );CHKERRCUSPARSE(stat); 1470 1471 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1472 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1473 thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1474 tempGPU->begin()); 1475 1476 /* Copy the temporary to the full solution. */ 1477 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU); 1478 1479 /* restore */ 1480 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1481 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1482 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1483 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1484 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1485 PetscFunctionReturn(0); 1486 } 1487 1488 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1489 { 1490 const PetscScalar *barray; 1491 PetscScalar *xarray; 1492 cusparseStatus_t stat; 1493 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1494 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1495 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1496 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1497 PetscErrorCode ierr; 1498 cudaError_t cerr; 1499 1500 PetscFunctionBegin; 1501 /* Analyze the matrix and create the transpose ... on the fly */ 1502 if (!loTriFactorT && !upTriFactorT) { 1503 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1504 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1505 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1506 } 1507 1508 /* Get the GPU pointers */ 1509 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1510 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1511 1512 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1513 /* First, solve U */ 1514 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1515 upTriFactorT->csrMat->num_rows, 1516 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1517 upTriFactorT->csrMat->num_entries, 1518 #endif 1519 &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1520 upTriFactorT->csrMat->values->data().get(), 1521 upTriFactorT->csrMat->row_offsets->data().get(), 1522 upTriFactorT->csrMat->column_indices->data().get(), 1523 upTriFactorT->solveInfo, 1524 barray, tempGPU->data().get() 1525 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1526 ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1527 #endif 1528 );CHKERRCUSPARSE(stat); 1529 1530 /* Then, solve L */ 1531 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1532 loTriFactorT->csrMat->num_rows, 1533 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1534 loTriFactorT->csrMat->num_entries, 1535 #endif 1536 &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1537 loTriFactorT->csrMat->values->data().get(), 1538 loTriFactorT->csrMat->row_offsets->data().get(), 1539 loTriFactorT->csrMat->column_indices->data().get(), 1540 loTriFactorT->solveInfo, 1541 tempGPU->data().get(), xarray 1542 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1543 ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1544 #endif 1545 );CHKERRCUSPARSE(stat); 1546 1547 /* restore */ 1548 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1549 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1550 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1551 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1552 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1553 PetscFunctionReturn(0); 1554 } 1555 1556 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1557 { 1558 const PetscScalar *barray; 1559 PetscScalar *xarray; 1560 thrust::device_ptr<const PetscScalar> bGPU; 1561 thrust::device_ptr<PetscScalar> xGPU; 1562 cusparseStatus_t stat; 1563 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1564 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1565 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1566 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1567 PetscErrorCode ierr; 1568 cudaError_t cerr; 1569 1570 PetscFunctionBegin; 1571 1572 /* Get the GPU pointers */ 1573 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1574 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1575 xGPU = thrust::device_pointer_cast(xarray); 1576 bGPU = thrust::device_pointer_cast(barray); 1577 1578 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1579 /* First, reorder with the row permutation */ 1580 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1581 thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 1582 tempGPU->begin()); 1583 1584 /* Next, solve L */ 1585 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1586 loTriFactor->csrMat->num_rows, 1587 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1588 loTriFactor->csrMat->num_entries, 1589 #endif 1590 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1591 loTriFactor->csrMat->values->data().get(), 1592 loTriFactor->csrMat->row_offsets->data().get(), 1593 loTriFactor->csrMat->column_indices->data().get(), 1594 loTriFactor->solveInfo, 1595 tempGPU->data().get(), xarray 1596 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1597 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 1598 #endif 1599 );CHKERRCUSPARSE(stat); 1600 1601 /* Then, solve U */ 1602 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1603 upTriFactor->csrMat->num_rows, 1604 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1605 upTriFactor->csrMat->num_entries, 1606 #endif 1607 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1608 upTriFactor->csrMat->values->data().get(), 1609 upTriFactor->csrMat->row_offsets->data().get(), 1610 upTriFactor->csrMat->column_indices->data().get(), 1611 upTriFactor->solveInfo, 1612 xarray, tempGPU->data().get() 1613 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1614 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 1615 #endif 1616 );CHKERRCUSPARSE(stat); 1617 1618 /* Last, reorder with the column permutation */ 1619 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 1620 thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), 1621 xGPU); 1622 1623 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1624 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1625 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1626 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1627 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1628 PetscFunctionReturn(0); 1629 } 1630 1631 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1632 { 1633 const PetscScalar *barray; 1634 PetscScalar *xarray; 1635 cusparseStatus_t stat; 1636 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1637 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1638 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1639 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1640 PetscErrorCode ierr; 1641 cudaError_t cerr; 1642 1643 PetscFunctionBegin; 1644 /* Get the GPU pointers */ 1645 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1646 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1647 1648 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1649 /* First, solve L */ 1650 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1651 loTriFactor->csrMat->num_rows, 1652 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1653 loTriFactor->csrMat->num_entries, 1654 #endif 1655 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1656 loTriFactor->csrMat->values->data().get(), 1657 loTriFactor->csrMat->row_offsets->data().get(), 1658 loTriFactor->csrMat->column_indices->data().get(), 1659 loTriFactor->solveInfo, 1660 barray, tempGPU->data().get() 1661 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1662 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 1663 #endif 1664 );CHKERRCUSPARSE(stat); 1665 1666 /* Next, solve U */ 1667 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1668 upTriFactor->csrMat->num_rows, 1669 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1670 upTriFactor->csrMat->num_entries, 1671 #endif 1672 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1673 upTriFactor->csrMat->values->data().get(), 1674 upTriFactor->csrMat->row_offsets->data().get(), 1675 upTriFactor->csrMat->column_indices->data().get(), 1676 upTriFactor->solveInfo, 1677 tempGPU->data().get(), xarray 1678 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1679 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 1680 #endif 1681 );CHKERRCUSPARSE(stat); 1682 1683 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1684 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1685 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1686 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1687 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1688 PetscFunctionReturn(0); 1689 } 1690 1691 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) 1692 { 1693 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1694 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1695 cudaError_t cerr; 1696 PetscErrorCode ierr; 1697 1698 PetscFunctionBegin; 1699 if (A->offloadmask == PETSC_OFFLOAD_GPU) { 1700 CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat; 1701 1702 ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 1703 cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 1704 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1705 ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr); 1706 ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 1707 A->offloadmask = PETSC_OFFLOAD_BOTH; 1708 } 1709 PetscFunctionReturn(0); 1710 } 1711 1712 static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 1713 { 1714 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1715 PetscErrorCode ierr; 1716 1717 PetscFunctionBegin; 1718 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 1719 *array = a->a; 1720 A->offloadmask = PETSC_OFFLOAD_CPU; 1721 PetscFunctionReturn(0); 1722 } 1723 1724 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 1725 { 1726 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1727 Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 1728 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1729 PetscInt m = A->rmap->n,*ii,*ridx,tmp; 1730 PetscErrorCode ierr; 1731 cusparseStatus_t stat; 1732 PetscBool both = PETSC_TRUE; 1733 cudaError_t err; 1734 1735 PetscFunctionBegin; 1736 if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU"); 1737 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 1738 if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */ 1739 CsrMatrix *matrix; 1740 matrix = (CsrMatrix*)cusparsestruct->mat->mat; 1741 1742 if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values"); 1743 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1744 matrix->values->assign(a->a, a->a+a->nz); 1745 err = WaitForCUDA();CHKERRCUDA(err); 1746 ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 1747 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1748 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); 1749 } else { 1750 PetscInt nnz; 1751 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1752 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr); 1753 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 1754 delete cusparsestruct->workVector; 1755 delete cusparsestruct->rowoffsets_gpu; 1756 cusparsestruct->workVector = NULL; 1757 cusparsestruct->rowoffsets_gpu = NULL; 1758 try { 1759 if (a->compressedrow.use) { 1760 m = a->compressedrow.nrows; 1761 ii = a->compressedrow.i; 1762 ridx = a->compressedrow.rindex; 1763 } else { 1764 m = A->rmap->n; 1765 ii = a->i; 1766 ridx = NULL; 1767 } 1768 if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data"); 1769 if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data"); 1770 if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; } 1771 else nnz = a->nz; 1772 1773 /* create cusparse matrix */ 1774 cusparsestruct->nrows = m; 1775 matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 1776 stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat); 1777 stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 1778 stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1779 1780 err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 1781 err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1782 err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1783 err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1784 err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1785 err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1786 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 1787 1788 /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1789 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1790 /* set the matrix */ 1791 CsrMatrix *mat= new CsrMatrix; 1792 mat->num_rows = m; 1793 mat->num_cols = A->cmap->n; 1794 mat->num_entries = nnz; 1795 mat->row_offsets = new THRUSTINTARRAY32(m+1); 1796 mat->row_offsets->assign(ii, ii + m+1); 1797 1798 mat->column_indices = new THRUSTINTARRAY32(nnz); 1799 mat->column_indices->assign(a->j, a->j+nnz); 1800 1801 mat->values = new THRUSTARRAY(nnz); 1802 if (a->a) mat->values->assign(a->a, a->a+nnz); 1803 1804 /* assign the pointer */ 1805 matstruct->mat = mat; 1806 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1807 if (mat->num_rows) { /* cusparse errors on empty matrices! */ 1808 stat = cusparseCreateCsr(&matstruct->matDescr, 1809 mat->num_rows, mat->num_cols, mat->num_entries, 1810 mat->row_offsets->data().get(), mat->column_indices->data().get(), 1811 mat->values->data().get(), 1812 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 1813 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 1814 } 1815 #endif 1816 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1817 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1818 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1819 #else 1820 CsrMatrix *mat= new CsrMatrix; 1821 mat->num_rows = m; 1822 mat->num_cols = A->cmap->n; 1823 mat->num_entries = nnz; 1824 mat->row_offsets = new THRUSTINTARRAY32(m+1); 1825 mat->row_offsets->assign(ii, ii + m+1); 1826 1827 mat->column_indices = new THRUSTINTARRAY32(nnz); 1828 mat->column_indices->assign(a->j, a->j+nnz); 1829 1830 mat->values = new THRUSTARRAY(nnz); 1831 if (a->a) mat->values->assign(a->a, a->a+nnz); 1832 1833 cusparseHybMat_t hybMat; 1834 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1835 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1836 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1837 stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, 1838 matstruct->descr, mat->values->data().get(), 1839 mat->row_offsets->data().get(), 1840 mat->column_indices->data().get(), 1841 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1842 /* assign the pointer */ 1843 matstruct->mat = hybMat; 1844 1845 if (mat) { 1846 if (mat->values) delete (THRUSTARRAY*)mat->values; 1847 if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices; 1848 if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets; 1849 delete (CsrMatrix*)mat; 1850 } 1851 #endif 1852 } 1853 1854 /* assign the compressed row indices */ 1855 if (a->compressedrow.use) { 1856 cusparsestruct->workVector = new THRUSTARRAY(m); 1857 matstruct->cprowIndices = new THRUSTINTARRAY(m); 1858 matstruct->cprowIndices->assign(ridx,ridx+m); 1859 tmp = m; 1860 } else { 1861 cusparsestruct->workVector = NULL; 1862 matstruct->cprowIndices = NULL; 1863 tmp = 0; 1864 } 1865 ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr); 1866 1867 /* assign the pointer */ 1868 cusparsestruct->mat = matstruct; 1869 } catch(char *ex) { 1870 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1871 } 1872 err = WaitForCUDA();CHKERRCUDA(err); 1873 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1874 cusparsestruct->nonzerostate = A->nonzerostate; 1875 } 1876 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 1877 } 1878 PetscFunctionReturn(0); 1879 } 1880 1881 struct VecCUDAPlusEquals 1882 { 1883 template <typename Tuple> 1884 __host__ __device__ 1885 void operator()(Tuple t) 1886 { 1887 thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 1888 } 1889 }; 1890 1891 struct VecCUDAEquals 1892 { 1893 template <typename Tuple> 1894 __host__ __device__ 1895 void operator()(Tuple t) 1896 { 1897 thrust::get<1>(t) = thrust::get<0>(t); 1898 } 1899 }; 1900 1901 struct VecCUDAEqualsReverse 1902 { 1903 template <typename Tuple> 1904 __host__ __device__ 1905 void operator()(Tuple t) 1906 { 1907 thrust::get<0>(t) = thrust::get<1>(t); 1908 } 1909 }; 1910 1911 struct MatMatCusparse { 1912 PetscBool cisdense; 1913 PetscScalar *Bt; 1914 Mat X; 1915 PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */ 1916 PetscLogDouble flops; 1917 CsrMatrix *Bcsr; 1918 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1919 cusparseSpMatDescr_t matSpBDescr; 1920 PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 1921 cusparseDnMatDescr_t matBDescr; 1922 cusparseDnMatDescr_t matCDescr; 1923 PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/ 1924 size_t mmBufferSize; 1925 void *mmBuffer; 1926 void *mmBuffer2; /* SpGEMM WorkEstimation buffer */ 1927 cusparseSpGEMMDescr_t spgemmDesc; 1928 #endif 1929 }; 1930 1931 static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 1932 { 1933 PetscErrorCode ierr; 1934 MatMatCusparse *mmdata = (MatMatCusparse *)data; 1935 cudaError_t cerr; 1936 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1937 cusparseStatus_t stat; 1938 #endif 1939 1940 PetscFunctionBegin; 1941 cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr); 1942 delete mmdata->Bcsr; 1943 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1944 if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); } 1945 if (mmdata->mmBuffer) { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); } 1946 if (mmdata->mmBuffer2) { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); } 1947 if (mmdata->matBDescr) { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); } 1948 if (mmdata->matCDescr) { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); } 1949 if (mmdata->spgemmDesc) { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); } 1950 #endif 1951 ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr); 1952 ierr = PetscFree(data);CHKERRQ(ierr); 1953 PetscFunctionReturn(0); 1954 } 1955 1956 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool); 1957 1958 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 1959 { 1960 Mat_Product *product = C->product; 1961 Mat A,B; 1962 PetscInt m,n,blda,clda; 1963 PetscBool flg,biscuda; 1964 Mat_SeqAIJCUSPARSE *cusp; 1965 cusparseStatus_t stat; 1966 cusparseOperation_t opA; 1967 const PetscScalar *barray; 1968 PetscScalar *carray; 1969 PetscErrorCode ierr; 1970 MatMatCusparse *mmdata; 1971 Mat_SeqAIJCUSPARSEMultStruct *mat; 1972 CsrMatrix *csrmat; 1973 cudaError_t cerr; 1974 1975 PetscFunctionBegin; 1976 MatCheckProduct(C,1); 1977 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty"); 1978 mmdata = (MatMatCusparse*)product->data; 1979 A = product->A; 1980 B = product->B; 1981 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 1982 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 1983 /* currently CopyToGpu does not copy if the matrix is bound to CPU 1984 Instead of silently accepting the wrong answer, I prefer to raise the error */ 1985 if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 1986 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1987 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1988 switch (product->type) { 1989 case MATPRODUCT_AB: 1990 case MATPRODUCT_PtAP: 1991 mat = cusp->mat; 1992 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1993 m = A->rmap->n; 1994 n = B->cmap->n; 1995 break; 1996 case MATPRODUCT_AtB: 1997 if (!A->form_explicit_transpose) { 1998 mat = cusp->mat; 1999 opA = CUSPARSE_OPERATION_TRANSPOSE; 2000 } else { 2001 ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr); 2002 mat = cusp->matTranspose; 2003 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2004 } 2005 m = A->cmap->n; 2006 n = B->cmap->n; 2007 break; 2008 case MATPRODUCT_ABt: 2009 case MATPRODUCT_RARt: 2010 mat = cusp->mat; 2011 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2012 m = A->rmap->n; 2013 n = B->rmap->n; 2014 break; 2015 default: 2016 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2017 } 2018 if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 2019 csrmat = (CsrMatrix*)mat->mat; 2020 /* if the user passed a CPU matrix, copy the data to the GPU */ 2021 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr); 2022 if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);} 2023 ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr); 2024 2025 ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr); 2026 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2027 ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2028 ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr); 2029 } else { 2030 ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr); 2031 ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr); 2032 } 2033 2034 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2035 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2036 cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 2037 /* (re)allcoate mmBuffer if not initialized or LDAs are different */ 2038 if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 2039 size_t mmBufferSize; 2040 if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;} 2041 if (!mmdata->matBDescr) { 2042 stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2043 mmdata->Blda = blda; 2044 } 2045 2046 if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;} 2047 if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 2048 stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2049 mmdata->Clda = clda; 2050 } 2051 2052 if (!mat->matDescr) { 2053 stat = cusparseCreateCsr(&mat->matDescr, 2054 csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, 2055 csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), 2056 csrmat->values->data().get(), 2057 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2058 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 2059 } 2060 stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one, 2061 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2062 mmdata->matCDescr,cusparse_scalartype, 2063 cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat); 2064 if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) { 2065 cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); 2066 cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr); 2067 mmdata->mmBufferSize = mmBufferSize; 2068 } 2069 mmdata->initialized = PETSC_TRUE; 2070 } else { 2071 /* to be safe, always update pointers of the mats */ 2072 stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat); 2073 stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat); 2074 stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat); 2075 } 2076 2077 /* do cusparseSpMM, which supports transpose on B */ 2078 stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one, 2079 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2080 mmdata->matCDescr,cusparse_scalartype, 2081 cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2082 #else 2083 PetscInt k; 2084 /* cusparseXcsrmm does not support transpose on B */ 2085 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2086 cublasHandle_t cublasv2handle; 2087 cublasStatus_t cerr; 2088 2089 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 2090 cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T, 2091 B->cmap->n,B->rmap->n, 2092 &PETSC_CUSPARSE_ONE ,barray,blda, 2093 &PETSC_CUSPARSE_ZERO,barray,blda, 2094 mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr); 2095 blda = B->cmap->n; 2096 k = B->cmap->n; 2097 } else { 2098 k = B->rmap->n; 2099 } 2100 2101 /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 2102 stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k, 2103 csrmat->num_entries,mat->alpha_one,mat->descr, 2104 csrmat->values->data().get(), 2105 csrmat->row_offsets->data().get(), 2106 csrmat->column_indices->data().get(), 2107 mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero, 2108 carray,clda);CHKERRCUSPARSE(stat); 2109 #endif 2110 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2111 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2112 ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr); 2113 ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr); 2114 if (product->type == MATPRODUCT_RARt) { 2115 ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2116 ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2117 } else if (product->type == MATPRODUCT_PtAP) { 2118 ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2119 ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2120 } else { 2121 ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr); 2122 } 2123 if (mmdata->cisdense) { 2124 ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr); 2125 } 2126 if (!biscuda) { 2127 ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 2128 } 2129 PetscFunctionReturn(0); 2130 } 2131 2132 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2133 { 2134 Mat_Product *product = C->product; 2135 Mat A,B; 2136 PetscInt m,n; 2137 PetscBool cisdense,flg; 2138 PetscErrorCode ierr; 2139 MatMatCusparse *mmdata; 2140 Mat_SeqAIJCUSPARSE *cusp; 2141 2142 PetscFunctionBegin; 2143 MatCheckProduct(C,1); 2144 if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty"); 2145 A = product->A; 2146 B = product->B; 2147 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2148 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2149 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2150 if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2151 switch (product->type) { 2152 case MATPRODUCT_AB: 2153 m = A->rmap->n; 2154 n = B->cmap->n; 2155 break; 2156 case MATPRODUCT_AtB: 2157 m = A->cmap->n; 2158 n = B->cmap->n; 2159 break; 2160 case MATPRODUCT_ABt: 2161 m = A->rmap->n; 2162 n = B->rmap->n; 2163 break; 2164 case MATPRODUCT_PtAP: 2165 m = B->cmap->n; 2166 n = B->cmap->n; 2167 break; 2168 case MATPRODUCT_RARt: 2169 m = B->rmap->n; 2170 n = B->rmap->n; 2171 break; 2172 default: 2173 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2174 } 2175 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2176 /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */ 2177 ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr); 2178 ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr); 2179 2180 /* product data */ 2181 ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2182 mmdata->cisdense = cisdense; 2183 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0) 2184 /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */ 2185 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2186 cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr); 2187 } 2188 #endif 2189 /* for these products we need intermediate storage */ 2190 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2191 ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr); 2192 ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr); 2193 if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */ 2194 ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr); 2195 } else { 2196 ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr); 2197 } 2198 } 2199 C->product->data = mmdata; 2200 C->product->destroy = MatDestroy_MatMatCusparse; 2201 2202 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 2203 PetscFunctionReturn(0); 2204 } 2205 2206 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2207 { 2208 Mat_Product *product = C->product; 2209 Mat A,B; 2210 Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2211 Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 2212 Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2213 CsrMatrix *Acsr,*Bcsr,*Ccsr; 2214 PetscBool flg; 2215 PetscErrorCode ierr; 2216 cusparseStatus_t stat; 2217 cudaError_t cerr; 2218 MatProductType ptype; 2219 MatMatCusparse *mmdata; 2220 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2221 cusparseSpMatDescr_t BmatSpDescr; 2222 #endif 2223 2224 PetscFunctionBegin; 2225 MatCheckProduct(C,1); 2226 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty"); 2227 ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2228 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name); 2229 mmdata = (MatMatCusparse*)C->product->data; 2230 A = product->A; 2231 B = product->B; 2232 if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */ 2233 mmdata->reusesym = PETSC_FALSE; 2234 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2235 if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2236 Cmat = Ccusp->mat; 2237 if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]); 2238 Ccsr = (CsrMatrix*)Cmat->mat; 2239 if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct"); 2240 goto finalize; 2241 } 2242 if (!c->nz) goto finalize; 2243 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2244 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2245 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2246 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name); 2247 if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2248 if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2249 Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2250 Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2251 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2252 if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2253 if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2254 if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2255 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2256 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 2257 2258 ptype = product->type; 2259 if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 2260 if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 2261 switch (ptype) { 2262 case MATPRODUCT_AB: 2263 Amat = Acusp->mat; 2264 Bmat = Bcusp->mat; 2265 break; 2266 case MATPRODUCT_AtB: 2267 Amat = Acusp->matTranspose; 2268 Bmat = Bcusp->mat; 2269 break; 2270 case MATPRODUCT_ABt: 2271 Amat = Acusp->mat; 2272 Bmat = Bcusp->matTranspose; 2273 break; 2274 default: 2275 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2276 } 2277 Cmat = Ccusp->mat; 2278 if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2279 if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2280 if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]); 2281 Acsr = (CsrMatrix*)Amat->mat; 2282 Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */ 2283 Ccsr = (CsrMatrix*)Cmat->mat; 2284 if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct"); 2285 if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct"); 2286 if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct"); 2287 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2288 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2289 BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */ 2290 stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2291 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2292 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2293 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2294 stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2295 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2296 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2297 #else 2298 stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2299 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2300 Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2301 Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2302 Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2303 #endif 2304 ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2305 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2306 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2307 C->offloadmask = PETSC_OFFLOAD_GPU; 2308 finalize: 2309 /* shorter version of MatAssemblyEnd_SeqAIJ */ 2310 ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 2311 ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 2312 ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr); 2313 c->reallocs = 0; 2314 C->info.mallocs += 0; 2315 C->info.nz_unneeded = 0; 2316 C->assembled = C->was_assembled = PETSC_TRUE; 2317 C->num_ass++; 2318 PetscFunctionReturn(0); 2319 } 2320 2321 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2322 { 2323 Mat_Product *product = C->product; 2324 Mat A,B; 2325 Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2326 Mat_SeqAIJ *a,*b,*c; 2327 Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2328 CsrMatrix *Acsr,*Bcsr,*Ccsr; 2329 PetscInt i,j,m,n,k; 2330 PetscBool flg; 2331 PetscErrorCode ierr; 2332 cusparseStatus_t stat; 2333 cudaError_t cerr; 2334 MatProductType ptype; 2335 MatMatCusparse *mmdata; 2336 PetscLogDouble flops; 2337 PetscBool biscompressed,ciscompressed; 2338 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2339 int64_t C_num_rows1, C_num_cols1, C_nnz1; 2340 size_t bufSize2; 2341 cusparseSpMatDescr_t BmatSpDescr; 2342 #else 2343 int cnz; 2344 #endif 2345 2346 PetscFunctionBegin; 2347 MatCheckProduct(C,1); 2348 if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty"); 2349 A = product->A; 2350 B = product->B; 2351 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2352 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2353 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2354 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name); 2355 a = (Mat_SeqAIJ*)A->data; 2356 b = (Mat_SeqAIJ*)B->data; 2357 Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2358 Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2359 if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2360 if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2361 2362 /* product data */ 2363 ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2364 C->product->data = mmdata; 2365 C->product->destroy = MatDestroy_MatMatCusparse; 2366 2367 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2368 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 2369 ptype = product->type; 2370 if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 2371 if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 2372 biscompressed = PETSC_FALSE; 2373 ciscompressed = PETSC_FALSE; 2374 switch (ptype) { 2375 case MATPRODUCT_AB: 2376 m = A->rmap->n; 2377 n = B->cmap->n; 2378 k = A->cmap->n; 2379 Amat = Acusp->mat; 2380 Bmat = Bcusp->mat; 2381 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2382 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2383 break; 2384 case MATPRODUCT_AtB: 2385 m = A->cmap->n; 2386 n = B->cmap->n; 2387 k = A->rmap->n; 2388 ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr); 2389 Amat = Acusp->matTranspose; 2390 Bmat = Bcusp->mat; 2391 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2392 break; 2393 case MATPRODUCT_ABt: 2394 m = A->rmap->n; 2395 n = B->rmap->n; 2396 k = A->cmap->n; 2397 ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr); 2398 Amat = Acusp->mat; 2399 Bmat = Bcusp->matTranspose; 2400 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2401 break; 2402 default: 2403 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2404 } 2405 2406 /* create cusparse matrix */ 2407 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2408 ierr = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 2409 c = (Mat_SeqAIJ*)C->data; 2410 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2411 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 2412 Ccsr = new CsrMatrix; 2413 2414 c->compressedrow.use = ciscompressed; 2415 if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */ 2416 c->compressedrow.nrows = a->compressedrow.nrows; 2417 ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr); 2418 ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr); 2419 Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 2420 Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 2421 Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows); 2422 } else { 2423 c->compressedrow.nrows = 0; 2424 c->compressedrow.i = NULL; 2425 c->compressedrow.rindex = NULL; 2426 Ccusp->workVector = NULL; 2427 Cmat->cprowIndices = NULL; 2428 } 2429 Ccusp->nrows = ciscompressed ? c->compressedrow.nrows : m; 2430 Ccusp->mat = Cmat; 2431 Ccusp->mat->mat = Ccsr; 2432 Ccsr->num_rows = Ccusp->nrows; 2433 Ccsr->num_cols = n; 2434 Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1); 2435 stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat); 2436 stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 2437 stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 2438 cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 2439 cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 2440 cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 2441 cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2442 cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2443 cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2444 if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */ 2445 thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0); 2446 c->nz = 0; 2447 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2448 Ccsr->values = new THRUSTARRAY(c->nz); 2449 goto finalizesym; 2450 } 2451 2452 if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2453 if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2454 Acsr = (CsrMatrix*)Amat->mat; 2455 if (!biscompressed) { 2456 Bcsr = (CsrMatrix*)Bmat->mat; 2457 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2458 BmatSpDescr = Bmat->matDescr; 2459 #endif 2460 } else { /* we need to use row offsets for the full matrix */ 2461 CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat; 2462 Bcsr = new CsrMatrix; 2463 Bcsr->num_rows = B->rmap->n; 2464 Bcsr->num_cols = cBcsr->num_cols; 2465 Bcsr->num_entries = cBcsr->num_entries; 2466 Bcsr->column_indices = cBcsr->column_indices; 2467 Bcsr->values = cBcsr->values; 2468 if (!Bcusp->rowoffsets_gpu) { 2469 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 2470 Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 2471 ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 2472 } 2473 Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 2474 mmdata->Bcsr = Bcsr; 2475 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2476 if (Bcsr->num_rows && Bcsr->num_cols) { 2477 stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, 2478 Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2479 Bcsr->values->data().get(), 2480 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2481 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2482 } 2483 BmatSpDescr = mmdata->matSpBDescr; 2484 #endif 2485 } 2486 if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct"); 2487 if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct"); 2488 /* precompute flops count */ 2489 if (ptype == MATPRODUCT_AB) { 2490 for (i=0, flops = 0; i<A->rmap->n; i++) { 2491 const PetscInt st = a->i[i]; 2492 const PetscInt en = a->i[i+1]; 2493 for (j=st; j<en; j++) { 2494 const PetscInt brow = a->j[j]; 2495 flops += 2.*(b->i[brow+1] - b->i[brow]); 2496 } 2497 } 2498 } else if (ptype == MATPRODUCT_AtB) { 2499 for (i=0, flops = 0; i<A->rmap->n; i++) { 2500 const PetscInt anzi = a->i[i+1] - a->i[i]; 2501 const PetscInt bnzi = b->i[i+1] - b->i[i]; 2502 flops += (2.*anzi)*bnzi; 2503 } 2504 } else { /* TODO */ 2505 flops = 0.; 2506 } 2507 2508 mmdata->flops = flops; 2509 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2510 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2511 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2512 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, 2513 NULL, NULL, NULL, 2514 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2515 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2516 stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2517 /* ask bufferSize bytes for external memory */ 2518 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2519 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2520 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2521 mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat); 2522 cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr); 2523 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2524 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2525 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2526 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2527 mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat); 2528 /* ask bufferSize again bytes for external memory */ 2529 stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2530 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2531 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2532 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat); 2533 /* The CUSPARSE documentation is not clear, nor the API 2534 We need both buffers to perform the operations properly! 2535 mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 2536 it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 2537 is stored in the descriptor! What a messy API... */ 2538 cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr); 2539 /* compute the intermediate product of A * B */ 2540 stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2541 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2542 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2543 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2544 /* get matrix C non-zero entries C_nnz1 */ 2545 stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat); 2546 c->nz = (PetscInt) C_nnz1; 2547 ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr); 2548 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2549 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2550 Ccsr->values = new THRUSTARRAY(c->nz); 2551 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2552 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2553 Ccsr->values->data().get());CHKERRCUSPARSE(stat); 2554 stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2555 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2556 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2557 #else 2558 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 2559 stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2560 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2561 Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2562 Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2563 Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat); 2564 c->nz = cnz; 2565 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2566 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2567 Ccsr->values = new THRUSTARRAY(c->nz); 2568 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2569 2570 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2571 /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 2572 I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 2573 D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */ 2574 stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2575 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2576 Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2577 Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2578 Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2579 #endif 2580 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2581 ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2582 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2583 finalizesym: 2584 c->singlemalloc = PETSC_FALSE; 2585 c->free_a = PETSC_TRUE; 2586 c->free_ij = PETSC_TRUE; 2587 ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 2588 ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 2589 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 2590 PetscInt *d_i = c->i; 2591 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 2592 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 2593 ii = *Ccsr->row_offsets; 2594 jj = *Ccsr->column_indices; 2595 if (ciscompressed) d_i = c->compressedrow.i; 2596 cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2597 cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2598 } else { 2599 PetscInt *d_i = c->i; 2600 if (ciscompressed) d_i = c->compressedrow.i; 2601 cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2602 cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2603 } 2604 if (ciscompressed) { /* need to expand host row offsets */ 2605 PetscInt r = 0; 2606 c->i[0] = 0; 2607 for (k = 0; k < c->compressedrow.nrows; k++) { 2608 const PetscInt next = c->compressedrow.rindex[k]; 2609 const PetscInt old = c->compressedrow.i[k]; 2610 for (; r < next; r++) c->i[r+1] = old; 2611 } 2612 for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows]; 2613 } 2614 ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 2615 ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 2616 ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 2617 c->maxnz = c->nz; 2618 c->nonzerorowcnt = 0; 2619 c->rmax = 0; 2620 for (k = 0; k < m; k++) { 2621 const PetscInt nn = c->i[k+1] - c->i[k]; 2622 c->ilen[k] = c->imax[k] = nn; 2623 c->nonzerorowcnt += (PetscInt)!!nn; 2624 c->rmax = PetscMax(c->rmax,nn); 2625 } 2626 ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr); 2627 ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 2628 Ccsr->num_entries = c->nz; 2629 2630 C->nonzerostate++; 2631 ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr); 2632 ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr); 2633 Ccusp->nonzerostate = C->nonzerostate; 2634 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 2635 C->preallocated = PETSC_TRUE; 2636 C->assembled = PETSC_FALSE; 2637 C->was_assembled = PETSC_FALSE; 2638 if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */ 2639 mmdata->reusesym = PETSC_TRUE; 2640 C->offloadmask = PETSC_OFFLOAD_GPU; 2641 } 2642 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2643 PetscFunctionReturn(0); 2644 } 2645 2646 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 2647 2648 /* handles sparse or dense B */ 2649 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat) 2650 { 2651 Mat_Product *product = mat->product; 2652 PetscErrorCode ierr; 2653 PetscBool isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE; 2654 2655 PetscFunctionBegin; 2656 MatCheckProduct(mat,1); 2657 ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr); 2658 if (!product->A->boundtocpu && !product->B->boundtocpu) { 2659 ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr); 2660 } 2661 if (product->type == MATPRODUCT_ABC) { 2662 Ciscusp = PETSC_FALSE; 2663 if (!product->C->boundtocpu) { 2664 ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr); 2665 } 2666 } 2667 if (isdense) { 2668 switch (product->type) { 2669 case MATPRODUCT_AB: 2670 case MATPRODUCT_AtB: 2671 case MATPRODUCT_ABt: 2672 case MATPRODUCT_PtAP: 2673 case MATPRODUCT_RARt: 2674 if (product->A->boundtocpu) { 2675 ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr); 2676 } else { 2677 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 2678 } 2679 break; 2680 case MATPRODUCT_ABC: 2681 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2682 break; 2683 default: 2684 break; 2685 } 2686 } else if (Biscusp && Ciscusp) { 2687 switch (product->type) { 2688 case MATPRODUCT_AB: 2689 case MATPRODUCT_AtB: 2690 case MATPRODUCT_ABt: 2691 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2692 break; 2693 case MATPRODUCT_PtAP: 2694 case MATPRODUCT_RARt: 2695 case MATPRODUCT_ABC: 2696 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2697 break; 2698 default: 2699 break; 2700 } 2701 } else { /* fallback for AIJ */ 2702 ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 2703 } 2704 PetscFunctionReturn(0); 2705 } 2706 2707 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2708 { 2709 PetscErrorCode ierr; 2710 2711 PetscFunctionBegin; 2712 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2713 PetscFunctionReturn(0); 2714 } 2715 2716 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz) 2717 { 2718 PetscErrorCode ierr; 2719 2720 PetscFunctionBegin; 2721 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2722 PetscFunctionReturn(0); 2723 } 2724 2725 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2726 { 2727 PetscErrorCode ierr; 2728 2729 PetscFunctionBegin; 2730 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 2731 PetscFunctionReturn(0); 2732 } 2733 2734 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2735 { 2736 PetscErrorCode ierr; 2737 2738 PetscFunctionBegin; 2739 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 2740 PetscFunctionReturn(0); 2741 } 2742 2743 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2744 { 2745 PetscErrorCode ierr; 2746 2747 PetscFunctionBegin; 2748 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2749 PetscFunctionReturn(0); 2750 } 2751 2752 __global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y) 2753 { 2754 int i = blockIdx.x*blockDim.x + threadIdx.x; 2755 if (i < n) y[idx[i]] += x[i]; 2756 } 2757 2758 /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 2759 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm) 2760 { 2761 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2762 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 2763 Mat_SeqAIJCUSPARSEMultStruct *matstruct; 2764 PetscScalar *xarray,*zarray,*dptr,*beta,*xptr; 2765 PetscErrorCode ierr; 2766 cudaError_t cerr; 2767 cusparseStatus_t stat; 2768 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2769 PetscBool compressed; 2770 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2771 PetscInt nx,ny; 2772 #endif 2773 2774 PetscFunctionBegin; 2775 if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported"); 2776 if (!a->nonzerorowcnt) { 2777 if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);} 2778 else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);} 2779 PetscFunctionReturn(0); 2780 } 2781 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 2782 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2783 if (!trans) { 2784 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 2785 if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 2786 } else { 2787 if (herm || !A->form_explicit_transpose) { 2788 opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 2789 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 2790 } else { 2791 if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);} 2792 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 2793 } 2794 } 2795 /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 2796 compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE; 2797 2798 try { 2799 ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 2800 if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */ 2801 else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */ 2802 2803 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2804 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 2805 /* z = A x + beta y. 2806 If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 2807 When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 2808 */ 2809 xptr = xarray; 2810 dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 2811 beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 2812 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2813 /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 2814 allocated to accommodate different uses. So we get the length info directly from mat. 2815 */ 2816 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2817 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2818 nx = mat->num_cols; 2819 ny = mat->num_rows; 2820 } 2821 #endif 2822 } else { 2823 /* z = A^T x + beta y 2824 If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 2825 Note A^Tx is of full length, so we set beta to 1.0 if y exists. 2826 */ 2827 xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 2828 dptr = zarray; 2829 beta = yy ? matstruct->beta_one : matstruct->beta_zero; 2830 if (compressed) { /* Scatter x to work vector */ 2831 thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 2832 thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 2833 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 2834 VecCUDAEqualsReverse()); 2835 } 2836 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2837 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2838 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2839 nx = mat->num_rows; 2840 ny = mat->num_cols; 2841 } 2842 #endif 2843 } 2844 2845 /* csr_spmv does y = alpha op(A) x + beta y */ 2846 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2847 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2848 if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 2849 if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 2850 stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 2851 stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 2852 stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, 2853 matstruct->matDescr, 2854 matstruct->cuSpMV[opA].vecXDescr, beta, 2855 matstruct->cuSpMV[opA].vecYDescr, 2856 cusparse_scalartype, 2857 cusparsestruct->spmvAlg, 2858 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat); 2859 cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr); 2860 2861 matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 2862 } else { 2863 /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 2864 stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat); 2865 stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat); 2866 } 2867 2868 stat = cusparseSpMV(cusparsestruct->handle, opA, 2869 matstruct->alpha_one, 2870 matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTransposeForMult() */ 2871 matstruct->cuSpMV[opA].vecXDescr, 2872 beta, 2873 matstruct->cuSpMV[opA].vecYDescr, 2874 cusparse_scalartype, 2875 cusparsestruct->spmvAlg, 2876 matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat); 2877 #else 2878 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2879 stat = cusparse_csr_spmv(cusparsestruct->handle, opA, 2880 mat->num_rows, mat->num_cols, 2881 mat->num_entries, matstruct->alpha_one, matstruct->descr, 2882 mat->values->data().get(), mat->row_offsets->data().get(), 2883 mat->column_indices->data().get(), xptr, beta, 2884 dptr);CHKERRCUSPARSE(stat); 2885 #endif 2886 } else { 2887 if (cusparsestruct->nrows) { 2888 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2889 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 2890 #else 2891 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 2892 stat = cusparse_hyb_spmv(cusparsestruct->handle, opA, 2893 matstruct->alpha_one, matstruct->descr, hybMat, 2894 xptr, beta, 2895 dptr);CHKERRCUSPARSE(stat); 2896 #endif 2897 } 2898 } 2899 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2900 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2901 2902 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 2903 if (yy) { /* MatMultAdd: zz = A*xx + yy */ 2904 if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 2905 ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */ 2906 } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */ 2907 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 2908 } 2909 } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 2910 ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr); 2911 } 2912 2913 /* ScatterAdd the result from work vector into the full vector when A is compressed */ 2914 if (compressed) { 2915 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2916 /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred) 2917 and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to 2918 prevent that. So I just add a ScatterAdd kernel. 2919 */ 2920 #if 0 2921 thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 2922 thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream), 2923 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 2924 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 2925 VecCUDAPlusEquals()); 2926 #else 2927 PetscInt n = matstruct->cprowIndices->size(); 2928 ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray); 2929 #endif 2930 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2931 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2932 } 2933 } else { 2934 if (yy && yy != zz) { 2935 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 2936 } 2937 } 2938 ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 2939 if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);} 2940 else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);} 2941 } catch(char *ex) { 2942 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 2943 } 2944 if (yy) { 2945 ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); 2946 } else { 2947 ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr); 2948 } 2949 PetscFunctionReturn(0); 2950 } 2951 2952 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2953 { 2954 PetscErrorCode ierr; 2955 2956 PetscFunctionBegin; 2957 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2958 PetscFunctionReturn(0); 2959 } 2960 2961 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 2962 { 2963 PetscErrorCode ierr; 2964 PetscSplitCSRDataStructure *d_mat = NULL; 2965 PetscFunctionBegin; 2966 if (A->factortype == MAT_FACTOR_NONE) { 2967 d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat; 2968 } 2969 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it? 2970 if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0); 2971 if (d_mat) { 2972 A->offloadmask = PETSC_OFFLOAD_GPU; 2973 } 2974 2975 PetscFunctionReturn(0); 2976 } 2977 2978 /* --------------------------------------------------------------------------------*/ 2979 /*@ 2980 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 2981 (the default parallel PETSc format). This matrix will ultimately pushed down 2982 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 2983 assembly performance the user should preallocate the matrix storage by setting 2984 the parameter nz (or the array nnz). By setting these parameters accurately, 2985 performance during matrix assembly can be increased by more than a factor of 50. 2986 2987 Collective 2988 2989 Input Parameters: 2990 + comm - MPI communicator, set to PETSC_COMM_SELF 2991 . m - number of rows 2992 . n - number of columns 2993 . nz - number of nonzeros per row (same for all rows) 2994 - nnz - array containing the number of nonzeros in the various rows 2995 (possibly different for each row) or NULL 2996 2997 Output Parameter: 2998 . A - the matrix 2999 3000 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 3001 MatXXXXSetPreallocation() paradgm instead of this routine directly. 3002 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 3003 3004 Notes: 3005 If nnz is given then nz is ignored 3006 3007 The AIJ format (also called the Yale sparse matrix format or 3008 compressed row storage), is fully compatible with standard Fortran 77 3009 storage. That is, the stored row and column indices can begin at 3010 either one (as in Fortran) or zero. See the users' manual for details. 3011 3012 Specify the preallocated storage with either nz or nnz (not both). 3013 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 3014 allocation. For large problems you MUST preallocate memory or you 3015 will get TERRIBLE performance, see the users' manual chapter on matrices. 3016 3017 By default, this format uses inodes (identical nodes) when possible, to 3018 improve numerical efficiency of matrix-vector products and solves. We 3019 search for consecutive rows with the same nonzero structure, thereby 3020 reusing matrix information to achieve increased efficiency. 3021 3022 Level: intermediate 3023 3024 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 3025 @*/ 3026 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 3027 { 3028 PetscErrorCode ierr; 3029 3030 PetscFunctionBegin; 3031 ierr = MatCreate(comm,A);CHKERRQ(ierr); 3032 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 3033 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3034 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 3035 PetscFunctionReturn(0); 3036 } 3037 3038 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 3039 { 3040 PetscErrorCode ierr; 3041 PetscSplitCSRDataStructure *d_mat = NULL; 3042 3043 PetscFunctionBegin; 3044 if (A->factortype == MAT_FACTOR_NONE) { 3045 d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat; 3046 ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL; 3047 ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr); 3048 } else { 3049 ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr); 3050 } 3051 if (d_mat) { 3052 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3053 cudaError_t err; 3054 PetscSplitCSRDataStructure h_mat; 3055 ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr); 3056 err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err); 3057 if (a->compressedrow.use) { 3058 err = cudaFree(h_mat.diag.i);CHKERRCUDA(err); 3059 } 3060 err = cudaFree(d_mat);CHKERRCUDA(err); 3061 } 3062 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr); 3063 ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr); 3064 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 3065 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 3066 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr); 3067 ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 3068 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 3069 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 3070 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 3071 PetscFunctionReturn(0); 3072 } 3073 3074 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*); 3075 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool); 3076 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B) 3077 { 3078 PetscErrorCode ierr; 3079 3080 PetscFunctionBegin; 3081 ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 3082 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr); 3083 PetscFunctionReturn(0); 3084 } 3085 3086 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str) 3087 { 3088 PetscErrorCode ierr; 3089 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 3090 Mat_SeqAIJCUSPARSE *cy; 3091 Mat_SeqAIJCUSPARSE *cx; 3092 PetscScalar *ay; 3093 const PetscScalar *ax; 3094 CsrMatrix *csry,*csrx; 3095 cudaError_t cerr; 3096 3097 PetscFunctionBegin; 3098 cy = (Mat_SeqAIJCUSPARSE*)Y->spptr; 3099 cx = (Mat_SeqAIJCUSPARSE*)X->spptr; 3100 if (X->ops->axpy != Y->ops->axpy) { 3101 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr); 3102 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 3103 PetscFunctionReturn(0); 3104 } 3105 /* if we are here, it means both matrices are bound to GPU */ 3106 ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr); 3107 ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr); 3108 if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported"); 3109 if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported"); 3110 csry = (CsrMatrix*)cy->mat->mat; 3111 csrx = (CsrMatrix*)cx->mat->mat; 3112 /* see if we can turn this into a cublas axpy */ 3113 if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) { 3114 bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin()); 3115 if (eq) { 3116 eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin()); 3117 } 3118 if (eq) str = SAME_NONZERO_PATTERN; 3119 } 3120 /* spgeam is buggy with one column */ 3121 if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN; 3122 3123 if (str == SUBSET_NONZERO_PATTERN) { 3124 cusparseStatus_t stat; 3125 PetscScalar b = 1.0; 3126 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3127 size_t bufferSize; 3128 void *buffer; 3129 #endif 3130 3131 ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr); 3132 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3133 stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 3134 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3135 stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n, 3136 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3137 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3138 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat); 3139 cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr); 3140 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3141 stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3142 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3143 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3144 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat); 3145 cerr = WaitForCUDA();CHKERRCUDA(cerr); 3146 ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr); 3147 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3148 cerr = cudaFree(buffer);CHKERRCUDA(cerr); 3149 #else 3150 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3151 stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3152 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3153 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3154 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat); 3155 cerr = WaitForCUDA();CHKERRCUDA(cerr); 3156 ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr); 3157 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3158 #endif 3159 stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 3160 ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr); 3161 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3162 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3163 } else if (str == SAME_NONZERO_PATTERN) { 3164 cublasHandle_t cublasv2handle; 3165 cublasStatus_t berr; 3166 PetscBLASInt one = 1, bnz = 1; 3167 3168 ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr); 3169 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3170 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 3171 ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr); 3172 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3173 berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr); 3174 cerr = WaitForCUDA();CHKERRCUDA(cerr); 3175 ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr); 3176 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3177 ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr); 3178 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3179 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3180 } else { 3181 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr); 3182 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 3183 } 3184 PetscFunctionReturn(0); 3185 } 3186 3187 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a) 3188 { 3189 PetscErrorCode ierr; 3190 Mat_SeqAIJ *y = (Mat_SeqAIJ*)Y->data; 3191 PetscScalar *ay; 3192 cudaError_t cerr; 3193 cublasHandle_t cublasv2handle; 3194 cublasStatus_t berr; 3195 PetscBLASInt one = 1, bnz = 1; 3196 3197 PetscFunctionBegin; 3198 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3199 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 3200 ierr = PetscBLASIntCast(y->nz,&bnz);CHKERRQ(ierr); 3201 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3202 berr = cublasXscal(cublasv2handle,bnz,&a,ay,one);CHKERRCUBLAS(berr); 3203 cerr = WaitForCUDA();CHKERRCUDA(cerr); 3204 ierr = PetscLogGpuFlops(bnz);CHKERRQ(ierr); 3205 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3206 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3207 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3208 PetscFunctionReturn(0); 3209 } 3210 3211 static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) 3212 { 3213 PetscErrorCode ierr; 3214 PetscBool both = PETSC_FALSE; 3215 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3216 3217 PetscFunctionBegin; 3218 if (A->factortype == MAT_FACTOR_NONE) { 3219 Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr; 3220 if (spptr->mat) { 3221 CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat; 3222 if (matrix->values) { 3223 both = PETSC_TRUE; 3224 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3225 } 3226 } 3227 if (spptr->matTranspose) { 3228 CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat; 3229 if (matrix->values) { 3230 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3231 } 3232 } 3233 } 3234 //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 3235 ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr); 3236 ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 3237 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 3238 else A->offloadmask = PETSC_OFFLOAD_CPU; 3239 3240 PetscFunctionReturn(0); 3241 } 3242 3243 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg) 3244 { 3245 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3246 PetscErrorCode ierr; 3247 3248 PetscFunctionBegin; 3249 if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0); 3250 if (flg) { 3251 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 3252 3253 A->ops->scale = MatScale_SeqAIJ; 3254 A->ops->axpy = MatAXPY_SeqAIJ; 3255 A->ops->zeroentries = MatZeroEntries_SeqAIJ; 3256 A->ops->mult = MatMult_SeqAIJ; 3257 A->ops->multadd = MatMultAdd_SeqAIJ; 3258 A->ops->multtranspose = MatMultTranspose_SeqAIJ; 3259 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 3260 A->ops->multhermitiantranspose = NULL; 3261 A->ops->multhermitiantransposeadd = NULL; 3262 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ; 3263 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr); 3264 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 3265 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 3266 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 3267 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 3268 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr); 3269 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr); 3270 } else { 3271 A->ops->scale = MatScale_SeqAIJCUSPARSE; 3272 A->ops->axpy = MatAXPY_SeqAIJCUSPARSE; 3273 A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE; 3274 A->ops->mult = MatMult_SeqAIJCUSPARSE; 3275 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 3276 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 3277 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 3278 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 3279 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 3280 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE; 3281 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr); 3282 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3283 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3284 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3285 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3286 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr); 3287 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3288 } 3289 A->boundtocpu = flg; 3290 a->inode.use = flg; 3291 PetscFunctionReturn(0); 3292 } 3293 3294 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 3295 { 3296 PetscErrorCode ierr; 3297 cusparseStatus_t stat; 3298 Mat B; 3299 3300 PetscFunctionBegin; 3301 ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */ 3302 if (reuse == MAT_INITIAL_MATRIX) { 3303 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 3304 } else if (reuse == MAT_REUSE_MATRIX) { 3305 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 3306 } 3307 B = *newmat; 3308 3309 ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 3310 ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr); 3311 3312 if (reuse != MAT_REUSE_MATRIX && !B->spptr) { 3313 if (B->factortype == MAT_FACTOR_NONE) { 3314 Mat_SeqAIJCUSPARSE *spptr; 3315 ierr = PetscNew(&spptr);CHKERRQ(ierr); 3316 stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3317 stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat); 3318 spptr->format = MAT_CUSPARSE_CSR; 3319 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3320 spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 3321 spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 3322 spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 3323 #endif 3324 B->spptr = spptr; 3325 } else { 3326 Mat_SeqAIJCUSPARSETriFactors *spptr; 3327 3328 ierr = PetscNew(&spptr);CHKERRQ(ierr); 3329 stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3330 stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat); 3331 B->spptr = spptr; 3332 } 3333 B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 3334 } 3335 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 3336 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 3337 B->ops->setoption = MatSetOption_SeqAIJCUSPARSE; 3338 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 3339 B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 3340 B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 3341 3342 ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr); 3343 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3344 ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 3345 PetscFunctionReturn(0); 3346 } 3347 3348 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 3349 { 3350 PetscErrorCode ierr; 3351 3352 PetscFunctionBegin; 3353 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 3354 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 3355 PetscFunctionReturn(0); 3356 } 3357 3358 /*MC 3359 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 3360 3361 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 3362 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 3363 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 3364 3365 Options Database Keys: 3366 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 3367 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3368 - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3369 3370 Level: beginner 3371 3372 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 3373 M*/ 3374 3375 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*); 3376 3377 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 3378 { 3379 PetscErrorCode ierr; 3380 3381 PetscFunctionBegin; 3382 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr); 3383 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3384 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3385 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3386 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3387 3388 PetscFunctionReturn(0); 3389 } 3390 3391 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 3392 { 3393 PetscErrorCode ierr; 3394 cusparseStatus_t stat; 3395 3396 PetscFunctionBegin; 3397 if (*cusparsestruct) { 3398 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr); 3399 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr); 3400 delete (*cusparsestruct)->workVector; 3401 delete (*cusparsestruct)->rowoffsets_gpu; 3402 delete (*cusparsestruct)->cooPerm; 3403 delete (*cusparsestruct)->cooPerm_a; 3404 delete (*cusparsestruct)->csr2csc_i; 3405 if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);} 3406 ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr); 3407 } 3408 PetscFunctionReturn(0); 3409 } 3410 3411 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 3412 { 3413 PetscFunctionBegin; 3414 if (*mat) { 3415 delete (*mat)->values; 3416 delete (*mat)->column_indices; 3417 delete (*mat)->row_offsets; 3418 delete *mat; 3419 *mat = 0; 3420 } 3421 PetscFunctionReturn(0); 3422 } 3423 3424 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 3425 { 3426 cusparseStatus_t stat; 3427 PetscErrorCode ierr; 3428 3429 PetscFunctionBegin; 3430 if (*trifactor) { 3431 if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); } 3432 if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); } 3433 ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 3434 if ((*trifactor)->solveBuffer) {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);} 3435 if ((*trifactor)->AA_h) {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);} 3436 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3437 if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);} 3438 #endif 3439 ierr = PetscFree(*trifactor);CHKERRQ(ierr); 3440 } 3441 PetscFunctionReturn(0); 3442 } 3443 3444 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 3445 { 3446 CsrMatrix *mat; 3447 cusparseStatus_t stat; 3448 cudaError_t err; 3449 3450 PetscFunctionBegin; 3451 if (*matstruct) { 3452 if ((*matstruct)->mat) { 3453 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 3454 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3455 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3456 #else 3457 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 3458 stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat); 3459 #endif 3460 } else { 3461 mat = (CsrMatrix*)(*matstruct)->mat; 3462 CsrMatrix_Destroy(&mat); 3463 } 3464 } 3465 if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); } 3466 delete (*matstruct)->cprowIndices; 3467 if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); } 3468 if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); } 3469 if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); } 3470 3471 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3472 Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 3473 if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);} 3474 for (int i=0; i<3; i++) { 3475 if (mdata->cuSpMV[i].initialized) { 3476 err = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err); 3477 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat); 3478 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat); 3479 } 3480 } 3481 #endif 3482 delete *matstruct; 3483 *matstruct = NULL; 3484 } 3485 PetscFunctionReturn(0); 3486 } 3487 3488 PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors) 3489 { 3490 PetscErrorCode ierr; 3491 3492 PetscFunctionBegin; 3493 if (*trifactors) { 3494 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr); 3495 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr); 3496 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr); 3497 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr); 3498 delete (*trifactors)->rpermIndices; 3499 delete (*trifactors)->cpermIndices; 3500 delete (*trifactors)->workVector; 3501 (*trifactors)->rpermIndices = NULL; 3502 (*trifactors)->cpermIndices = NULL; 3503 (*trifactors)->workVector = NULL; 3504 if ((*trifactors)->a_band_d) {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);} 3505 if ((*trifactors)->i_band_d) {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);} 3506 (*trifactors)->init_dev_prop = PETSC_FALSE; 3507 } 3508 PetscFunctionReturn(0); 3509 } 3510 3511 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 3512 { 3513 PetscErrorCode ierr; 3514 cusparseHandle_t handle; 3515 cusparseStatus_t stat; 3516 3517 PetscFunctionBegin; 3518 if (*trifactors) { 3519 ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr); 3520 if (handle = (*trifactors)->handle) { 3521 stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 3522 } 3523 ierr = PetscFree(*trifactors);CHKERRQ(ierr); 3524 } 3525 PetscFunctionReturn(0); 3526 } 3527 3528 struct IJCompare 3529 { 3530 __host__ __device__ 3531 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3532 { 3533 if (t1.get<0>() < t2.get<0>()) return true; 3534 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 3535 return false; 3536 } 3537 }; 3538 3539 struct IJEqual 3540 { 3541 __host__ __device__ 3542 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3543 { 3544 if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false; 3545 return true; 3546 } 3547 }; 3548 3549 struct IJDiff 3550 { 3551 __host__ __device__ 3552 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 3553 { 3554 return t1 == t2 ? 0 : 1; 3555 } 3556 }; 3557 3558 struct IJSum 3559 { 3560 __host__ __device__ 3561 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 3562 { 3563 return t1||t2; 3564 } 3565 }; 3566 3567 #include <thrust/iterator/discard_iterator.h> 3568 PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 3569 { 3570 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3571 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3572 THRUSTARRAY *cooPerm_v = NULL; 3573 thrust::device_ptr<const PetscScalar> d_v; 3574 CsrMatrix *matrix; 3575 PetscErrorCode ierr; 3576 cudaError_t cerr; 3577 PetscInt n; 3578 3579 PetscFunctionBegin; 3580 if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct"); 3581 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix"); 3582 if (!cusp->cooPerm) { 3583 ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3584 ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3585 PetscFunctionReturn(0); 3586 } 3587 matrix = (CsrMatrix*)cusp->mat->mat; 3588 if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3589 if (!v) { 3590 if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3591 goto finalize; 3592 } 3593 n = cusp->cooPerm->size(); 3594 if (isCudaMem(v)) { 3595 d_v = thrust::device_pointer_cast(v); 3596 } else { 3597 cooPerm_v = new THRUSTARRAY(n); 3598 cooPerm_v->assign(v,v+n); 3599 d_v = cooPerm_v->data(); 3600 ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); 3601 } 3602 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3603 if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */ 3604 if (cusp->cooPerm_a) { 3605 THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size()); 3606 auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3607 thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3608 thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>()); 3609 delete cooPerm_w; 3610 } else { 3611 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 3612 matrix->values->begin())); 3613 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 3614 matrix->values->end())); 3615 thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); 3616 } 3617 } else { 3618 if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */ 3619 auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3620 thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3621 } else { 3622 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 3623 matrix->values->begin())); 3624 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 3625 matrix->values->end())); 3626 thrust::for_each(zibit,zieit,VecCUDAEquals()); 3627 } 3628 } 3629 cerr = WaitForCUDA();CHKERRCUDA(cerr); 3630 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3631 finalize: 3632 delete cooPerm_v; 3633 A->offloadmask = PETSC_OFFLOAD_GPU; 3634 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 3635 /* shorter version of MatAssemblyEnd_SeqAIJ */ 3636 ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr); 3637 ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 3638 ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr); 3639 a->reallocs = 0; 3640 A->info.mallocs += 0; 3641 A->info.nz_unneeded = 0; 3642 A->assembled = A->was_assembled = PETSC_TRUE; 3643 A->num_ass++; 3644 PetscFunctionReturn(0); 3645 } 3646 3647 PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy) 3648 { 3649 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3650 PetscErrorCode ierr; 3651 3652 PetscFunctionBegin; 3653 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3654 if (!cusp) PetscFunctionReturn(0); 3655 if (destroy) { 3656 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr); 3657 delete cusp->csr2csc_i; 3658 cusp->csr2csc_i = NULL; 3659 } 3660 A->transupdated = PETSC_FALSE; 3661 PetscFunctionReturn(0); 3662 } 3663 3664 #include <thrust/binary_search.h> 3665 PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[]) 3666 { 3667 PetscErrorCode ierr; 3668 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3669 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3670 PetscInt cooPerm_n, nzr = 0; 3671 cudaError_t cerr; 3672 3673 PetscFunctionBegin; 3674 ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr); 3675 ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr); 3676 cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0; 3677 if (n != cooPerm_n) { 3678 delete cusp->cooPerm; 3679 delete cusp->cooPerm_a; 3680 cusp->cooPerm = NULL; 3681 cusp->cooPerm_a = NULL; 3682 } 3683 if (n) { 3684 THRUSTINTARRAY d_i(n); 3685 THRUSTINTARRAY d_j(n); 3686 THRUSTINTARRAY ii(A->rmap->n); 3687 3688 if (!cusp->cooPerm) { cusp->cooPerm = new THRUSTINTARRAY(n); } 3689 if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); } 3690 3691 ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 3692 d_i.assign(coo_i,coo_i+n); 3693 d_j.assign(coo_j,coo_j+n); 3694 auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin())); 3695 auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end())); 3696 3697 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3698 thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0); 3699 thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); 3700 *cusp->cooPerm_a = d_i; 3701 THRUSTINTARRAY w = d_j; 3702 3703 auto nekey = thrust::unique(fkey, ekey, IJEqual()); 3704 if (nekey == ekey) { /* all entries are unique */ 3705 delete cusp->cooPerm_a; 3706 cusp->cooPerm_a = NULL; 3707 } else { /* I couldn't come up with a more elegant algorithm */ 3708 adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); 3709 adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff()); 3710 (*cusp->cooPerm_a)[0] = 0; 3711 w[0] = 0; 3712 thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); 3713 thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); 3714 } 3715 thrust::counting_iterator<PetscInt> search_begin(0); 3716 thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), 3717 search_begin, search_begin + A->rmap->n, 3718 ii.begin()); 3719 cerr = WaitForCUDA();CHKERRCUDA(cerr); 3720 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3721 3722 ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr); 3723 a->singlemalloc = PETSC_FALSE; 3724 a->free_a = PETSC_TRUE; 3725 a->free_ij = PETSC_TRUE; 3726 ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr); 3727 a->i[0] = 0; 3728 cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 3729 a->nz = a->maxnz = a->i[A->rmap->n]; 3730 a->rmax = 0; 3731 ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr); 3732 ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr); 3733 cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 3734 if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); } 3735 if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); } 3736 for (PetscInt i = 0; i < A->rmap->n; i++) { 3737 const PetscInt nnzr = a->i[i+1] - a->i[i]; 3738 nzr += (PetscInt)!!(nnzr); 3739 a->ilen[i] = a->imax[i] = nnzr; 3740 a->rmax = PetscMax(a->rmax,nnzr); 3741 } 3742 a->nonzerorowcnt = nzr; 3743 A->preallocated = PETSC_TRUE; 3744 ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr); 3745 ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr); 3746 } else { 3747 ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr); 3748 } 3749 ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr); 3750 3751 /* We want to allocate the CUSPARSE struct for matvec now. 3752 The code is so convoluted now that I prefer to copy zeros */ 3753 ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr); 3754 ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr); 3755 A->offloadmask = PETSC_OFFLOAD_CPU; 3756 A->nonzerostate++; 3757 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 3758 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 3759 3760 A->assembled = PETSC_FALSE; 3761 A->was_assembled = PETSC_FALSE; 3762 PetscFunctionReturn(0); 3763 } 3764 3765 PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a) 3766 { 3767 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3768 CsrMatrix *csr; 3769 PetscErrorCode ierr; 3770 3771 PetscFunctionBegin; 3772 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3773 PetscValidPointer(a,2); 3774 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3775 if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3776 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 3777 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3778 csr = (CsrMatrix*)cusp->mat->mat; 3779 if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3780 *a = csr->values->data().get(); 3781 PetscFunctionReturn(0); 3782 } 3783 3784 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a) 3785 { 3786 PetscFunctionBegin; 3787 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3788 PetscValidPointer(a,2); 3789 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3790 *a = NULL; 3791 PetscFunctionReturn(0); 3792 } 3793 3794 PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a) 3795 { 3796 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3797 CsrMatrix *csr; 3798 PetscErrorCode ierr; 3799 3800 PetscFunctionBegin; 3801 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3802 PetscValidPointer(a,2); 3803 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3804 if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3805 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 3806 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3807 csr = (CsrMatrix*)cusp->mat->mat; 3808 if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3809 *a = csr->values->data().get(); 3810 A->offloadmask = PETSC_OFFLOAD_GPU; 3811 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); 3812 PetscFunctionReturn(0); 3813 } 3814 3815 PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a) 3816 { 3817 PetscErrorCode ierr; 3818 3819 PetscFunctionBegin; 3820 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3821 PetscValidPointer(a,2); 3822 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3823 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 3824 *a = NULL; 3825 PetscFunctionReturn(0); 3826 } 3827 3828 PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a) 3829 { 3830 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3831 CsrMatrix *csr; 3832 PetscErrorCode ierr; 3833 3834 PetscFunctionBegin; 3835 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3836 PetscValidPointer(a,2); 3837 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3838 if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3839 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3840 csr = (CsrMatrix*)cusp->mat->mat; 3841 if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3842 *a = csr->values->data().get(); 3843 A->offloadmask = PETSC_OFFLOAD_GPU; 3844 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); 3845 PetscFunctionReturn(0); 3846 } 3847 3848 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a) 3849 { 3850 PetscErrorCode ierr; 3851 3852 PetscFunctionBegin; 3853 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3854 PetscValidPointer(a,2); 3855 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3856 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 3857 *a = NULL; 3858 PetscFunctionReturn(0); 3859 } 3860 3861 struct IJCompare4 3862 { 3863 __host__ __device__ 3864 inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) 3865 { 3866 if (t1.get<0>() < t2.get<0>()) return true; 3867 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 3868 return false; 3869 } 3870 }; 3871 3872 struct Shift 3873 { 3874 int _shift; 3875 3876 Shift(int shift) : _shift(shift) {} 3877 __host__ __device__ 3878 inline int operator() (const int &c) 3879 { 3880 return c + _shift; 3881 } 3882 }; 3883 3884 /* merges to SeqAIJCUSPARSE matrices, [A';B']' operation in matlab notation */ 3885 PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 3886 { 3887 PetscErrorCode ierr; 3888 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c; 3889 Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp; 3890 Mat_SeqAIJCUSPARSEMultStruct *Cmat; 3891 CsrMatrix *Acsr,*Bcsr,*Ccsr; 3892 PetscInt Annz,Bnnz; 3893 cusparseStatus_t stat; 3894 PetscInt i,m,n,zero = 0; 3895 cudaError_t cerr; 3896 3897 PetscFunctionBegin; 3898 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3899 PetscValidHeaderSpecific(B,MAT_CLASSID,2); 3900 PetscValidPointer(C,4); 3901 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3902 PetscCheckTypeName(B,MATSEQAIJCUSPARSE); 3903 if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n); 3904 if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 3905 if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3906 if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3907 if (reuse == MAT_INITIAL_MATRIX) { 3908 m = A->rmap->n; 3909 n = A->cmap->n + B->cmap->n; 3910 ierr = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr); 3911 ierr = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr); 3912 ierr = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3913 c = (Mat_SeqAIJ*)(*C)->data; 3914 Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 3915 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 3916 Ccsr = new CsrMatrix; 3917 Cmat->cprowIndices = NULL; 3918 c->compressedrow.use = PETSC_FALSE; 3919 c->compressedrow.nrows = 0; 3920 c->compressedrow.i = NULL; 3921 c->compressedrow.rindex = NULL; 3922 Ccusp->workVector = NULL; 3923 Ccusp->nrows = m; 3924 Ccusp->mat = Cmat; 3925 Ccusp->mat->mat = Ccsr; 3926 Ccsr->num_rows = m; 3927 Ccsr->num_cols = n; 3928 stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat); 3929 stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 3930 stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 3931 cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 3932 cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 3933 cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 3934 cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 3935 cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 3936 cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 3937 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 3938 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 3939 ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr); 3940 ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr); 3941 if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3942 if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3943 3944 Acsr = (CsrMatrix*)Acusp->mat->mat; 3945 Bcsr = (CsrMatrix*)Bcusp->mat->mat; 3946 Annz = (PetscInt)Acsr->column_indices->size(); 3947 Bnnz = (PetscInt)Bcsr->column_indices->size(); 3948 c->nz = Annz + Bnnz; 3949 Ccsr->row_offsets = new THRUSTINTARRAY32(m+1); 3950 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3951 Ccsr->values = new THRUSTARRAY(c->nz); 3952 Ccsr->num_entries = c->nz; 3953 Ccusp->cooPerm = new THRUSTINTARRAY(c->nz); 3954 if (c->nz) { 3955 auto Acoo = new THRUSTINTARRAY32(Annz); 3956 auto Bcoo = new THRUSTINTARRAY32(Bnnz); 3957 auto Ccoo = new THRUSTINTARRAY32(c->nz); 3958 THRUSTINTARRAY32 *Aroff,*Broff; 3959 3960 if (a->compressedrow.use) { /* need full row offset */ 3961 if (!Acusp->rowoffsets_gpu) { 3962 Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 3963 Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 3964 ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 3965 } 3966 Aroff = Acusp->rowoffsets_gpu; 3967 } else Aroff = Acsr->row_offsets; 3968 if (b->compressedrow.use) { /* need full row offset */ 3969 if (!Bcusp->rowoffsets_gpu) { 3970 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 3971 Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 3972 ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 3973 } 3974 Broff = Bcusp->rowoffsets_gpu; 3975 } else Broff = Bcsr->row_offsets; 3976 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3977 stat = cusparseXcsr2coo(Acusp->handle, 3978 Aroff->data().get(), 3979 Annz, 3980 m, 3981 Acoo->data().get(), 3982 CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 3983 stat = cusparseXcsr2coo(Bcusp->handle, 3984 Broff->data().get(), 3985 Bnnz, 3986 m, 3987 Bcoo->data().get(), 3988 CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 3989 /* Issues when using bool with large matrices on SUMMIT 10.2.89 */ 3990 auto Aperm = thrust::make_constant_iterator(1); 3991 auto Bperm = thrust::make_constant_iterator(0); 3992 #if PETSC_PKG_CUDA_VERSION_GE(10,0,0) 3993 auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n)); 3994 auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n)); 3995 #else 3996 /* there are issues instantiating the merge operation using a transform iterator for the columns of B */ 3997 auto Bcib = Bcsr->column_indices->begin(); 3998 auto Bcie = Bcsr->column_indices->end(); 3999 thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n)); 4000 #endif 4001 auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz); 4002 auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm)); 4003 auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm)); 4004 auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm)); 4005 auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm)); 4006 auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin())); 4007 auto p1 = Ccusp->cooPerm->begin(); 4008 auto p2 = Ccusp->cooPerm->begin(); 4009 thrust::advance(p2,Annz); 4010 PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4())); 4011 #if PETSC_PKG_CUDA_VERSION_LT(10,0,0) 4012 thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n)); 4013 #endif 4014 auto cci = thrust::make_counting_iterator(zero); 4015 auto cce = thrust::make_counting_iterator(c->nz); 4016 #if 0 //Errors on SUMMIT cuda 11.1.0 4017 PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>())); 4018 #else 4019 auto pred = thrust::identity<int>(); 4020 PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred)); 4021 PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred)); 4022 #endif 4023 stat = cusparseXcoo2csr(Ccusp->handle, 4024 Ccoo->data().get(), 4025 c->nz, 4026 m, 4027 Ccsr->row_offsets->data().get(), 4028 CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4029 cerr = WaitForCUDA();CHKERRCUDA(cerr); 4030 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4031 delete wPerm; 4032 delete Acoo; 4033 delete Bcoo; 4034 delete Ccoo; 4035 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4036 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, 4037 Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), 4038 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 4039 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 4040 #endif 4041 if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */ 4042 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4043 Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct; 4044 CsrMatrix *CcsrT = new CsrMatrix; 4045 CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4046 CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4047 4048 (*C)->form_explicit_transpose = PETSC_TRUE; 4049 (*C)->transupdated = PETSC_TRUE; 4050 Ccusp->rowoffsets_gpu = NULL; 4051 CmatT->cprowIndices = NULL; 4052 CmatT->mat = CcsrT; 4053 CcsrT->num_rows = n; 4054 CcsrT->num_cols = m; 4055 CcsrT->num_entries = c->nz; 4056 4057 CcsrT->row_offsets = new THRUSTINTARRAY32(n+1); 4058 CcsrT->column_indices = new THRUSTINTARRAY32(c->nz); 4059 CcsrT->values = new THRUSTARRAY(c->nz); 4060 4061 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4062 auto rT = CcsrT->row_offsets->begin(); 4063 if (AT) { 4064 rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT); 4065 thrust::advance(rT,-1); 4066 } 4067 if (BT) { 4068 auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz)); 4069 auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz)); 4070 thrust::copy(titb,tite,rT); 4071 } 4072 auto cT = CcsrT->column_indices->begin(); 4073 if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT); 4074 if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT); 4075 auto vT = CcsrT->values->begin(); 4076 if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4077 if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 4078 cerr = WaitForCUDA();CHKERRCUDA(cerr); 4079 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4080 4081 stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat); 4082 stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4083 stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 4084 cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 4085 cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 4086 cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 4087 cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4088 cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4089 cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4090 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4091 stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, 4092 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), 4093 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 4094 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 4095 #endif 4096 Ccusp->matTranspose = CmatT; 4097 } 4098 } 4099 4100 c->singlemalloc = PETSC_FALSE; 4101 c->free_a = PETSC_TRUE; 4102 c->free_ij = PETSC_TRUE; 4103 ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 4104 ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 4105 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 4106 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 4107 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 4108 ii = *Ccsr->row_offsets; 4109 jj = *Ccsr->column_indices; 4110 cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4111 cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4112 } else { 4113 cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4114 cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4115 } 4116 ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 4117 ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 4118 ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 4119 c->maxnz = c->nz; 4120 c->nonzerorowcnt = 0; 4121 c->rmax = 0; 4122 for (i = 0; i < m; i++) { 4123 const PetscInt nn = c->i[i+1] - c->i[i]; 4124 c->ilen[i] = c->imax[i] = nn; 4125 c->nonzerorowcnt += (PetscInt)!!nn; 4126 c->rmax = PetscMax(c->rmax,nn); 4127 } 4128 ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr); 4129 ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 4130 (*C)->nonzerostate++; 4131 ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr); 4132 ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr); 4133 Ccusp->nonzerostate = (*C)->nonzerostate; 4134 (*C)->preallocated = PETSC_TRUE; 4135 } else { 4136 if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n); 4137 c = (Mat_SeqAIJ*)(*C)->data; 4138 if (c->nz) { 4139 Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 4140 if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm"); 4141 if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4142 if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate"); 4143 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4144 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 4145 if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4146 if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4147 Acsr = (CsrMatrix*)Acusp->mat->mat; 4148 Bcsr = (CsrMatrix*)Bcusp->mat->mat; 4149 Ccsr = (CsrMatrix*)Ccusp->mat->mat; 4150 if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size()); 4151 if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size()); 4152 if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size()); 4153 if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries); 4154 if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size()); 4155 auto pmid = Ccusp->cooPerm->begin(); 4156 thrust::advance(pmid,Acsr->num_entries); 4157 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4158 auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), 4159 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin()))); 4160 auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), 4161 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4162 thrust::for_each(zibait,zieait,VecCUDAEquals()); 4163 auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), 4164 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4165 auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), 4166 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end()))); 4167 thrust::for_each(zibbit,ziebit,VecCUDAEquals()); 4168 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr); 4169 if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) { 4170 if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct"); 4171 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4172 CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4173 CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4174 CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat; 4175 auto vT = CcsrT->values->begin(); 4176 if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4177 if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 4178 (*C)->transupdated = PETSC_TRUE; 4179 } 4180 cerr = WaitForCUDA();CHKERRCUDA(cerr); 4181 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4182 } 4183 } 4184 ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr); 4185 (*C)->assembled = PETSC_TRUE; 4186 (*C)->was_assembled = PETSC_FALSE; 4187 (*C)->offloadmask = PETSC_OFFLOAD_GPU; 4188 PetscFunctionReturn(0); 4189 } 4190 4191 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[]) 4192 { 4193 PetscErrorCode ierr; 4194 bool dmem; 4195 const PetscScalar *av; 4196 cudaError_t cerr; 4197 4198 PetscFunctionBegin; 4199 dmem = isCudaMem(v); 4200 ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr); 4201 if (n && idx) { 4202 THRUSTINTARRAY widx(n); 4203 widx.assign(idx,idx+n); 4204 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 4205 4206 THRUSTARRAY *w = NULL; 4207 thrust::device_ptr<PetscScalar> dv; 4208 if (dmem) { 4209 dv = thrust::device_pointer_cast(v); 4210 } else { 4211 w = new THRUSTARRAY(n); 4212 dv = w->data(); 4213 } 4214 thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av); 4215 4216 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv)); 4217 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n)); 4218 thrust::for_each(zibit,zieit,VecCUDAEquals()); 4219 if (w) { 4220 cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4221 } 4222 delete w; 4223 } else { 4224 cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4225 } 4226 if (!dmem) { ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); } 4227 ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr); 4228 PetscFunctionReturn(0); 4229 } 4230