/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <thrust/async/for_each.h>

const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
  } cusparseCsr2CscAlg_t;
*/
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif

static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);

PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);

PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          flg;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg || !cusparsestruct) PetscFunctionReturn(0);
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for sequential matrices
  on a single GPU of type seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr);
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr);
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  (*B)->canuseordering = PETSC_TRUE;
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.

   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
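
/* Illustrative sketch (not part of the PETSc source): the two equivalent ways a user selects the
   storage format that MatCUSPARSESetFormat_SeqAIJCUSPARSE() above records. The matrix size n and
   the per-row nonzero estimate 5 are placeholders; error checking is elided.

     Mat A;
     MatCreateSeqAIJCUSPARSE(PETSC_COMM_SELF,n,n,5,NULL,&A);
     MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,MAT_CUSPARSE_ELL);

   or, from the command line, relying on MatSetFromOptions_SeqAIJCUSPARSE() below:

     ./app -mat_type seqaijcusparse -mat_cusparse_storage_format ell
*/
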
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_FORM_EXPLICIT_TRANSPOSE:
    /* need to destroy the transpose matrix if present to prevent logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);}
    A->form_explicit_transpose = flg;
    break;
  default:
    ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
    break;
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);

static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If the user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
    if (flg && CUSPARSE_SPMV_CSR_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower = n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;

        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix */
        AiLo[0]   = (PetscInt) 0;
        AiLo[n]   = nzLower;
        AjLo[0]   = (PetscInt) 0;
        AALo[0]   = (MatScalar) 1.0;
        v         = aa;
        vi        = aj;
        offset    = 1;
        rowOffset = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows,
                                 loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 loTriFactor->solveInfo,
                                 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                                 loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h = AALo;
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v      = aa;
        vi     = aj;
        offset = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt) 0;
        AiUp[n] = nzUpper;
        offset  = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;
          vi = aj + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1./v[nz];
          AiUp[i]      = AiUp[i+1] - (nz+1);

          ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = n;
        upTriFactor->csrMat->num_cols    = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows,
                                 upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 upTriFactor->solveInfo,
                                 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                                 upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
        upTriFactor->AA_h = AAUp;
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      } else {
        if (!upTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v = aa + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1./v[nz];
          ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
        ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cusparseTriFactors->nnz = a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
    ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* upper triangular indices */
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
    ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
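
/* Illustrative sketch (not part of the PETSc source): the typical user-level calling sequence that
   exercises the ILU path above (see the MATSOLVERCUSPARSE man page). The vectors b and x are assumed
   to exist and error checking is elided; all routines used here (MatGetOrdering(), MatGetFactor(),
   MatILUFactorSymbolic(), MatLUFactorNumeric(), MatSolve()) are standard PETSc API calls.

     Mat           F;
     MatFactorInfo info;
     IS            isrow,iscol;

     MatGetOrdering(A,MATORDERINGNATURAL,&isrow,&iscol);
     MatFactorInfoInitialize(&info);
     MatGetFactor(A,MATSOLVERCUSPARSE,MAT_FACTOR_ILU,&F);  // dispatches to MatGetFactor_seqaijcusparse_cusparse()
     MatILUFactorSymbolic(F,A,isrow,iscol,&info);          // MatILUFactorSymbolic_SeqAIJCUSPARSE()
     MatLUFactorNumeric(F,A,&info);                        // MatLUFactorNumeric_SeqAIJCUSPARSE(), copies the factors to the GPU
     MatSolve(F,b,x);                                      // MatSolve_SeqAIJCUSPARSE[_NaturalOrdering]()
*/
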
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt) 0;
        AiUp[n] = nzUpper;
        offset  = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = A->rmap->n;
        upTriFactor->csrMat->num_cols    = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 upTriFactor->solveInfo,
                                 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                                 upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = A->rmap->n;
        loTriFactor->csrMat->num_cols    = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 loTriFactor->solveInfo,
                                 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                                 loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else {
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cusparseTriFactors->nnz = (a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip,*rip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
               CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                       loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                       loTriFactor->csrMat->values->data().get(),
                                       loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(),
                                       loTriFactorT->csrMat->values->data().get(),
                                       loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
#else
                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
#endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                           loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                           loTriFactorT->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           loTriFactorT->solveInfo,
                           loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                           loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
               CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                       upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                       upTriFactor->csrMat->values->data().get(),
                                       upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(),
                                       upTriFactorT->csrMat->values->data().get(),
                                       upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
#else
                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
#endif

  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                           upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                           upTriFactorT->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           upTriFactorT->solveInfo,
                           upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                           upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#endif

  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
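
/* Illustrative sketch (not part of the PETSc source): a host-side reference for what the csr2csc
   conversions above compute. The CSC representation of a matrix is exactly the CSR representation
   of its transpose, which is why the transposed factors can be solved with
   CUSPARSE_OPERATION_NON_TRANSPOSE. A minimal 0-based counting-sort version (no error checking):

     static void csr2csc_host(PetscInt m,PetscInt n,PetscInt nnz,
                              const PetscInt *Ai,const PetscInt *Aj,const PetscScalar *Aa,
                              PetscInt *Bi,PetscInt *Bj,PetscScalar *Ba)
     {
       PetscInt i,k;
       for (i = 0; i <= n; i++) Bi[i] = 0;
       for (k = 0; k < nnz; k++) Bi[Aj[k]+1]++;        // count entries per column
       for (i = 0; i < n; i++)  Bi[i+1] += Bi[i];      // prefix sum -> column pointers
       for (i = 0; i < m; i++) {                       // scatter each row into its columns
         for (k = Ai[i]; k < Ai[i+1]; k++) {
           PetscInt dest = Bi[Aj[k]]++;
           Bj[dest] = i;
           Ba[dest] = Aa[k];
         }
       }
       for (i = n; i > 0; i--) Bi[i] = Bi[i-1];        // undo the pointer shifts
       Bi[0] = 0;
     }
*/
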
struct PetscScalarToPetscInt
{
  __host__ __device__
  PetscInt operator()(PetscScalar s)
  {
    return (PetscInt)PetscRealPart(s);
  }
};

static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(0);
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
  }
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

    /* set alpha and beta */
    err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      CsrMatrix *matrixT = new CsrMatrix;
      matstructT->mat = matrixT;
      matrixT->num_rows       = A->cmap->n;
      matrixT->num_cols       = A->rmap->n;
      matrixT->num_entries    = a->nz;
      matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows+1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values         = new THRUSTARRAY(a->nz);

      if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
#if PETSC_PKG_CUDA_VERSION_GE(11,2,1)
      stat = cusparseCreateCsr(&matstructT->matDescr,
                               matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                               matrixT->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
#else
cusparse-11.x returns errors with zero-sized matrices until 11.2.1, 1268 see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1 1269 1270 I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set 1271 it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, 1272 when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly. 1273 */ 1274 if (matrixT->num_entries) { 1275 stat = cusparseCreateCsr(&matstructT->matDescr, 1276 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1277 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1278 matrixT->values->data().get(), 1279 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, 1280 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 1281 1282 } else { 1283 matstructT->matDescr = NULL; 1284 matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 1285 } 1286 #endif 1287 #endif 1288 } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 1289 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1290 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1291 #else 1292 CsrMatrix *temp = new CsrMatrix; 1293 CsrMatrix *tempT = new CsrMatrix; 1294 /* First convert HYB to CSR */ 1295 temp->num_rows = A->rmap->n; 1296 temp->num_cols = A->cmap->n; 1297 temp->num_entries = a->nz; 1298 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1299 temp->column_indices = new THRUSTINTARRAY32(a->nz); 1300 temp->values = new THRUSTARRAY(a->nz); 1301 1302 stat = cusparse_hyb2csr(cusparsestruct->handle, 1303 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 1304 temp->values->data().get(), 1305 temp->row_offsets->data().get(), 1306 temp->column_indices->data().get());CHKERRCUSPARSE(stat); 1307 1308 /* Next, convert CSR to CSC (i.e. the matrix transpose) */ 1309 tempT->num_rows = A->rmap->n; 1310 tempT->num_cols = A->cmap->n; 1311 tempT->num_entries = a->nz; 1312 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1313 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1314 tempT->values = new THRUSTARRAY(a->nz); 1315 1316 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1317 temp->num_cols, temp->num_entries, 1318 temp->values->data().get(), 1319 temp->row_offsets->data().get(), 1320 temp->column_indices->data().get(), 1321 tempT->values->data().get(), 1322 tempT->column_indices->data().get(), 1323 tempT->row_offsets->data().get(), 1324 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1325 1326 /* Last, convert CSC to HYB */ 1327 cusparseHybMat_t hybMat; 1328 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1329 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 
1330 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1331 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1332 matstructT->descr, tempT->values->data().get(), 1333 tempT->row_offsets->data().get(), 1334 tempT->column_indices->data().get(), 1335 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1336 1337 /* assign the pointer */ 1338 matstructT->mat = hybMat; 1339 A->transupdated = PETSC_TRUE; 1340 /* delete temporaries */ 1341 if (tempT) { 1342 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1343 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1344 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1345 delete (CsrMatrix*) tempT; 1346 } 1347 if (temp) { 1348 if (temp->values) delete (THRUSTARRAY*) temp->values; 1349 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1350 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1351 delete (CsrMatrix*) temp; 1352 } 1353 #endif 1354 } 1355 } 1356 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */ 1357 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1358 CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat; 1359 if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix"); 1360 if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows"); 1361 if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols"); 1362 if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values"); 1363 if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT"); 1364 if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows"); 1365 if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols"); 1366 if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values"); 1367 if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */ 1368 cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1369 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 1370 ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 1371 } 1372 if (!cusparsestruct->csr2csc_i) { 1373 THRUSTARRAY csr2csc_a(matrix->num_entries); 1374 PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0)); 1375 1376 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1377 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1378 void *csr2cscBuffer; 1379 size_t csr2cscBufferSize; 1380 stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, 1381 A->cmap->n, matrix->num_entries, 1382 matrix->values->data().get(), 1383 cusparsestruct->rowoffsets_gpu->data().get(), 1384 matrix->column_indices->data().get(), 1385 matrixT->values->data().get(), 1386 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1387 CUSPARSE_ACTION_NUMERIC,indexBase, 1388 cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat); 1389 err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err); 1390 #endif 1391 1392 if (matrix->num_entries) { 1393 /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in 1394 mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK. 
I checked every parameter and they were just fine. 1396 1397 Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[] 1398 should be filled with indexBase. So I just take a shortcut here. 1399 */ 1400 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1401 A->cmap->n,matrix->num_entries, 1402 csr2csc_a.data().get(), 1403 cusparsestruct->rowoffsets_gpu->data().get(), 1404 matrix->column_indices->data().get(), 1405 matrixT->values->data().get(), 1406 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1407 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1408 CUSPARSE_ACTION_NUMERIC,indexBase, 1409 cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat); 1410 #else 1411 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1412 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1413 #endif 1414 } else { 1415 matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 1416 } 1417 1418 cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); 1419 PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt())); 1420 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1421 err = cudaFree(csr2cscBuffer);CHKERRCUDA(err); 1422 #endif 1423 } 1424 PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), 1425 thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), 1426 matrixT->values->begin())); 1427 } 1428 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1429 ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1430 /* the compressed row indices are not used for matTranspose */ 1431 matstructT->cprowIndices = NULL; 1432 /* assign the pointer */ 1433 ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 1434 A->transupdated = PETSC_TRUE; 1435 PetscFunctionReturn(0); 1436 } 1437 1438 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 1439 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1440 { 1441 PetscInt n = xx->map->n; 1442 const PetscScalar *barray; 1443 PetscScalar *xarray; 1444 thrust::device_ptr<const PetscScalar> bGPU; 1445 thrust::device_ptr<PetscScalar> xGPU; 1446 cusparseStatus_t stat; 1447 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1448 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1449 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1450 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1451 PetscErrorCode ierr; 1452 1453 PetscFunctionBegin; 1454 /* Analyze the matrix and create the transpose ...
on the fly */ 1455 if (!loTriFactorT && !upTriFactorT) { 1456 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1457 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1458 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1459 } 1460 1461 /* Get the GPU pointers */ 1462 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1463 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1464 xGPU = thrust::device_pointer_cast(xarray); 1465 bGPU = thrust::device_pointer_cast(barray); 1466 1467 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1468 /* First, reorder with the row permutation */ 1469 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1470 thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 1471 xGPU); 1472 1473 /* First, solve U */ 1474 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1475 upTriFactorT->csrMat->num_rows, 1476 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1477 upTriFactorT->csrMat->num_entries, 1478 #endif 1479 &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1480 upTriFactorT->csrMat->values->data().get(), 1481 upTriFactorT->csrMat->row_offsets->data().get(), 1482 upTriFactorT->csrMat->column_indices->data().get(), 1483 upTriFactorT->solveInfo, 1484 xarray, 1485 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1486 tempGPU->data().get(), 1487 upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1488 #else 1489 tempGPU->data().get());CHKERRCUSPARSE(stat); 1490 #endif 1491 1492 /* Then, solve L */ 1493 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1494 loTriFactorT->csrMat->num_rows, 1495 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1496 loTriFactorT->csrMat->num_entries, 1497 #endif 1498 &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1499 loTriFactorT->csrMat->values->data().get(), 1500 loTriFactorT->csrMat->row_offsets->data().get(), 1501 loTriFactorT->csrMat->column_indices->data().get(), 1502 loTriFactorT->solveInfo, 1503 tempGPU->data().get(), 1504 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1505 xarray, 1506 loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1507 #else 1508 xarray);CHKERRCUSPARSE(stat); 1509 #endif 1510 1511 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1512 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1513 thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1514 tempGPU->begin()); 1515 1516 /* Copy the temporary to the full solution. 
*/ 1517 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU); 1518 1519 /* restore */ 1520 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1521 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1522 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1523 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1524 PetscFunctionReturn(0); 1525 } 1526 1527 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1528 { 1529 const PetscScalar *barray; 1530 PetscScalar *xarray; 1531 cusparseStatus_t stat; 1532 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1533 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1534 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1535 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1536 PetscErrorCode ierr; 1537 1538 PetscFunctionBegin; 1539 /* Analyze the matrix and create the transpose ... on the fly */ 1540 if (!loTriFactorT && !upTriFactorT) { 1541 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1542 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1543 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1544 } 1545 1546 /* Get the GPU pointers */ 1547 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1548 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1549 1550 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1551 /* First, solve U */ 1552 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1553 upTriFactorT->csrMat->num_rows, 1554 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1555 upTriFactorT->csrMat->num_entries, 1556 #endif 1557 &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1558 upTriFactorT->csrMat->values->data().get(), 1559 upTriFactorT->csrMat->row_offsets->data().get(), 1560 upTriFactorT->csrMat->column_indices->data().get(), 1561 upTriFactorT->solveInfo, 1562 barray, 1563 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1564 tempGPU->data().get(), 1565 upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1566 #else 1567 tempGPU->data().get());CHKERRCUSPARSE(stat); 1568 #endif 1569 1570 /* Then, solve L */ 1571 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1572 loTriFactorT->csrMat->num_rows, 1573 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1574 loTriFactorT->csrMat->num_entries, 1575 #endif 1576 &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1577 loTriFactorT->csrMat->values->data().get(), 1578 loTriFactorT->csrMat->row_offsets->data().get(), 1579 loTriFactorT->csrMat->column_indices->data().get(), 1580 loTriFactorT->solveInfo, 1581 tempGPU->data().get(), 1582 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1583 xarray, 1584 loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1585 #else 1586 xarray);CHKERRCUSPARSE(stat); 1587 #endif 1588 1589 /* restore */ 1590 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1591 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1592 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1593 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1594 PetscFunctionReturn(0); 1595 } 1596 1597 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1598 { 1599 const 
PetscScalar *barray; 1600 PetscScalar *xarray; 1601 thrust::device_ptr<const PetscScalar> bGPU; 1602 thrust::device_ptr<PetscScalar> xGPU; 1603 cusparseStatus_t stat; 1604 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1605 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1606 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1607 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1608 PetscErrorCode ierr; 1609 1610 PetscFunctionBegin; 1611 1612 /* Get the GPU pointers */ 1613 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1614 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1615 xGPU = thrust::device_pointer_cast(xarray); 1616 bGPU = thrust::device_pointer_cast(barray); 1617 1618 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1619 /* First, reorder with the row permutation */ 1620 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1621 thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 1622 tempGPU->begin()); 1623 1624 /* Next, solve L */ 1625 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1626 loTriFactor->csrMat->num_rows, 1627 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1628 loTriFactor->csrMat->num_entries, 1629 #endif 1630 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1631 loTriFactor->csrMat->values->data().get(), 1632 loTriFactor->csrMat->row_offsets->data().get(), 1633 loTriFactor->csrMat->column_indices->data().get(), 1634 loTriFactor->solveInfo, 1635 tempGPU->data().get(), 1636 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1637 xarray, 1638 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1639 #else 1640 xarray);CHKERRCUSPARSE(stat); 1641 #endif 1642 1643 /* Then, solve U */ 1644 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1645 upTriFactor->csrMat->num_rows, 1646 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1647 upTriFactor->csrMat->num_entries, 1648 #endif 1649 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1650 upTriFactor->csrMat->values->data().get(), 1651 upTriFactor->csrMat->row_offsets->data().get(), 1652 upTriFactor->csrMat->column_indices->data().get(), 1653 upTriFactor->solveInfo,xarray, 1654 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1655 tempGPU->data().get(), 1656 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1657 #else 1658 tempGPU->data().get());CHKERRCUSPARSE(stat); 1659 #endif 1660 1661 /* Last, reorder with the column permutation */ 1662 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 1663 thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), 1664 xGPU); 1665 1666 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1667 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1668 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1669 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1670 PetscFunctionReturn(0); 1671 } 1672 1673 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1674 { 1675 const PetscScalar *barray; 1676 PetscScalar *xarray; 1677 cusparseStatus_t stat; 1678 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = 
(Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1679 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1680 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1681 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1682 PetscErrorCode ierr; 1683 1684 PetscFunctionBegin; 1685 /* Get the GPU pointers */ 1686 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1687 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1688 1689 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1690 /* First, solve L */ 1691 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1692 loTriFactor->csrMat->num_rows, 1693 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1694 loTriFactor->csrMat->num_entries, 1695 #endif 1696 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1697 loTriFactor->csrMat->values->data().get(), 1698 loTriFactor->csrMat->row_offsets->data().get(), 1699 loTriFactor->csrMat->column_indices->data().get(), 1700 loTriFactor->solveInfo, 1701 barray, 1702 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1703 tempGPU->data().get(), 1704 loTriFactor->solvePolicy,loTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1705 #else 1706 tempGPU->data().get());CHKERRCUSPARSE(stat); 1707 #endif 1708 1709 /* Next, solve U */ 1710 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1711 upTriFactor->csrMat->num_rows, 1712 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1713 upTriFactor->csrMat->num_entries, 1714 #endif 1715 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1716 upTriFactor->csrMat->values->data().get(), 1717 upTriFactor->csrMat->row_offsets->data().get(), 1718 upTriFactor->csrMat->column_indices->data().get(), 1719 upTriFactor->solveInfo, 1720 tempGPU->data().get(), 1721 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1722 xarray, 1723 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1724 #else 1725 xarray);CHKERRCUSPARSE(stat); 1726 #endif 1727 1728 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1729 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1730 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1731 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1732 PetscFunctionReturn(0); 1733 } 1734 1735 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) 1736 { 1737 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1738 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1739 cudaError_t cerr; 1740 PetscErrorCode ierr; 1741 1742 PetscFunctionBegin; 1743 if (A->offloadmask == PETSC_OFFLOAD_GPU) { 1744 CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat; 1745 1746 ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 1747 cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 1748 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1749 ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr); 1750 ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 1751 A->offloadmask = PETSC_OFFLOAD_BOTH; 1752 } 1753 PetscFunctionReturn(0); 1754 } 1755 1756 static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 1757 { 1758 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1759 PetscErrorCode ierr; 1760 1761 PetscFunctionBegin; 1762 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 1763 *array = a->a; 1764 A->offloadmask = PETSC_OFFLOAD_CPU; 1765 PetscFunctionReturn(0); 1766 } 1767 1768 PETSC_INTERN 
PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 1769 { 1770 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1771 Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 1772 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1773 PetscInt m = A->rmap->n,*ii,*ridx,tmp; 1774 PetscErrorCode ierr; 1775 cusparseStatus_t stat; 1776 PetscBool both = PETSC_TRUE; 1777 cudaError_t err; 1778 1779 PetscFunctionBegin; 1780 if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU"); 1781 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 1782 if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */ 1783 CsrMatrix *matrix; 1784 matrix = (CsrMatrix*)cusparsestruct->mat->mat; 1785 1786 if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values"); 1787 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1788 matrix->values->assign(a->a, a->a+a->nz); 1789 err = WaitForCUDA();CHKERRCUDA(err); 1790 ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 1791 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1792 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); 1793 } else { 1794 PetscInt nnz; 1795 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1796 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr); 1797 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 1798 delete cusparsestruct->workVector; 1799 delete cusparsestruct->rowoffsets_gpu; 1800 cusparsestruct->workVector = NULL; 1801 cusparsestruct->rowoffsets_gpu = NULL; 1802 try { 1803 if (a->compressedrow.use) { 1804 m = a->compressedrow.nrows; 1805 ii = a->compressedrow.i; 1806 ridx = a->compressedrow.rindex; 1807 } else { 1808 m = A->rmap->n; 1809 ii = a->i; 1810 ridx = NULL; 1811 } 1812 if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data"); 1813 if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data"); 1814 if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; } 1815 else nnz = a->nz; 1816 1817 /* create cusparse matrix */ 1818 cusparsestruct->nrows = m; 1819 matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 1820 stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat); 1821 stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 1822 stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1823 1824 err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 1825 err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1826 err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1827 err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1828 err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1829 err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1830 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 1831 1832 /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1833 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1834 /* set the matrix */ 
1835 CsrMatrix *mat= new CsrMatrix; 1836 mat->num_rows = m; 1837 mat->num_cols = A->cmap->n; 1838 mat->num_entries = nnz; 1839 mat->row_offsets = new THRUSTINTARRAY32(m+1); 1840 mat->row_offsets->assign(ii, ii + m+1); 1841 1842 mat->column_indices = new THRUSTINTARRAY32(nnz); 1843 mat->column_indices->assign(a->j, a->j+nnz); 1844 1845 mat->values = new THRUSTARRAY(nnz); 1846 if (a->a) mat->values->assign(a->a, a->a+nnz); 1847 1848 /* assign the pointer */ 1849 matstruct->mat = mat; 1850 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1851 if (mat->num_rows) { /* cusparse errors on empty matrices! */ 1852 stat = cusparseCreateCsr(&matstruct->matDescr, 1853 mat->num_rows, mat->num_cols, mat->num_entries, 1854 mat->row_offsets->data().get(), mat->column_indices->data().get(), 1855 mat->values->data().get(), 1856 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 1857 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 1858 } 1859 #endif 1860 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1861 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1862 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1863 #else 1864 CsrMatrix *mat= new CsrMatrix; 1865 mat->num_rows = m; 1866 mat->num_cols = A->cmap->n; 1867 mat->num_entries = nnz; 1868 mat->row_offsets = new THRUSTINTARRAY32(m+1); 1869 mat->row_offsets->assign(ii, ii + m+1); 1870 1871 mat->column_indices = new THRUSTINTARRAY32(nnz); 1872 mat->column_indices->assign(a->j, a->j+nnz); 1873 1874 mat->values = new THRUSTARRAY(nnz); 1875 if (a->a) mat->values->assign(a->a, a->a+nnz); 1876 1877 cusparseHybMat_t hybMat; 1878 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1879 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 
1880 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1881 stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, 1882 matstruct->descr, mat->values->data().get(), 1883 mat->row_offsets->data().get(), 1884 mat->column_indices->data().get(), 1885 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1886 /* assign the pointer */ 1887 matstruct->mat = hybMat; 1888 1889 if (mat) { 1890 if (mat->values) delete (THRUSTARRAY*)mat->values; 1891 if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices; 1892 if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets; 1893 delete (CsrMatrix*)mat; 1894 } 1895 #endif 1896 } 1897 1898 /* assign the compressed row indices */ 1899 if (a->compressedrow.use) { 1900 cusparsestruct->workVector = new THRUSTARRAY(m); 1901 matstruct->cprowIndices = new THRUSTINTARRAY(m); 1902 matstruct->cprowIndices->assign(ridx,ridx+m); 1903 tmp = m; 1904 } else { 1905 cusparsestruct->workVector = NULL; 1906 matstruct->cprowIndices = NULL; 1907 tmp = 0; 1908 } 1909 ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr); 1910 1911 /* assign the pointer */ 1912 cusparsestruct->mat = matstruct; 1913 } catch(char *ex) { 1914 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1915 } 1916 err = WaitForCUDA();CHKERRCUDA(err); 1917 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1918 cusparsestruct->nonzerostate = A->nonzerostate; 1919 } 1920 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 1921 } 1922 PetscFunctionReturn(0); 1923 } 1924 1925 struct VecCUDAPlusEquals 1926 { 1927 template <typename Tuple> 1928 __host__ __device__ 1929 void operator()(Tuple t) 1930 { 1931 thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 1932 } 1933 }; 1934 1935 struct VecCUDAEquals 1936 { 1937 template <typename Tuple> 1938 __host__ __device__ 1939 void operator()(Tuple t) 1940 { 1941 thrust::get<1>(t) = thrust::get<0>(t); 1942 } 1943 }; 1944 1945 struct VecCUDAEqualsReverse 1946 { 1947 template <typename Tuple> 1948 __host__ __device__ 1949 void operator()(Tuple t) 1950 { 1951 thrust::get<0>(t) = thrust::get<1>(t); 1952 } 1953 }; 1954 1955 struct MatMatCusparse { 1956 PetscBool cisdense; 1957 PetscScalar *Bt; 1958 Mat X; 1959 PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */ 1960 PetscLogDouble flops; 1961 CsrMatrix *Bcsr; 1962 1963 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1964 cusparseSpMatDescr_t matSpBDescr; 1965 PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 1966 cusparseDnMatDescr_t matBDescr; 1967 cusparseDnMatDescr_t matCDescr; 1968 PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/ 1969 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 1970 void *dBuffer4; 1971 void *dBuffer5; 1972 #endif 1973 size_t mmBufferSize; 1974 void *mmBuffer; 1975 void *mmBuffer2; /* SpGEMM WorkEstimation buffer */ 1976 cusparseSpGEMMDescr_t spgemmDesc; 1977 #endif 1978 }; 1979 1980 static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 1981 { 1982 PetscErrorCode ierr; 1983 MatMatCusparse *mmdata = (MatMatCusparse *)data; 1984 cudaError_t cerr; 1985 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1986 cusparseStatus_t stat; 1987 #endif 1988 1989 PetscFunctionBegin; 1990 cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr); 1991 delete mmdata->Bcsr; 1992 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1993 if (mmdata->matSpBDescr) { stat = 
cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); } 1994 if (mmdata->matBDescr) { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); } 1995 if (mmdata->matCDescr) { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); } 1996 if (mmdata->spgemmDesc) { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); } 1997 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 1998 if (mmdata->dBuffer4) { cerr = cudaFree(mmdata->dBuffer4);CHKERRCUDA(cerr); } 1999 if (mmdata->dBuffer5) { cerr = cudaFree(mmdata->dBuffer5);CHKERRCUDA(cerr); } 2000 #endif 2001 if (mmdata->mmBuffer) { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); } 2002 if (mmdata->mmBuffer2) { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); } 2003 #endif 2004 ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr); 2005 ierr = PetscFree(data);CHKERRQ(ierr); 2006 PetscFunctionReturn(0); 2007 } 2008 2009 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool); 2010 2011 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2012 { 2013 Mat_Product *product = C->product; 2014 Mat A,B; 2015 PetscInt m,n,blda,clda; 2016 PetscBool flg,biscuda; 2017 Mat_SeqAIJCUSPARSE *cusp; 2018 cusparseStatus_t stat; 2019 cusparseOperation_t opA; 2020 const PetscScalar *barray; 2021 PetscScalar *carray; 2022 PetscErrorCode ierr; 2023 MatMatCusparse *mmdata; 2024 Mat_SeqAIJCUSPARSEMultStruct *mat; 2025 CsrMatrix *csrmat; 2026 2027 PetscFunctionBegin; 2028 MatCheckProduct(C,1); 2029 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty"); 2030 mmdata = (MatMatCusparse*)product->data; 2031 A = product->A; 2032 B = product->B; 2033 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2034 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2035 /* currently CopyToGpu does not copy if the matrix is bound to CPU 2036 Instead of silently accepting the wrong answer, I prefer to raise the error */ 2037 if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2038 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2039 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2040 switch (product->type) { 2041 case MATPRODUCT_AB: 2042 case MATPRODUCT_PtAP: 2043 mat = cusp->mat; 2044 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2045 m = A->rmap->n; 2046 n = B->cmap->n; 2047 break; 2048 case MATPRODUCT_AtB: 2049 if (!A->form_explicit_transpose) { 2050 mat = cusp->mat; 2051 opA = CUSPARSE_OPERATION_TRANSPOSE; 2052 } else { 2053 ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr); 2054 mat = cusp->matTranspose; 2055 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2056 } 2057 m = A->cmap->n; 2058 n = B->cmap->n; 2059 break; 2060 case MATPRODUCT_ABt: 2061 case MATPRODUCT_RARt: 2062 mat = cusp->mat; 2063 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2064 m = A->rmap->n; 2065 n = B->rmap->n; 2066 break; 2067 default: 2068 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2069 } 2070 if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 2071 csrmat = (CsrMatrix*)mat->mat; 2072 /* if the user passed a CPU matrix, copy the data to the GPU */ 2073 ierr = 
PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr); 2074 if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);} 2075 ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr); 2076 2077 ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr); 2078 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2079 ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2080 ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr); 2081 } else { 2082 ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr); 2083 ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr); 2084 } 2085 2086 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2087 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2088 cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 2089 /* (re)allocate mmBuffer if not initialized or LDAs are different */ 2090 if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 2091 size_t mmBufferSize; 2092 if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;} 2093 if (!mmdata->matBDescr) { 2094 stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2095 mmdata->Blda = blda; 2096 } 2097 2098 if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;} 2099 if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 2100 stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2101 mmdata->Clda = clda; 2102 } 2103 2104 if (!mat->matDescr) { 2105 stat = cusparseCreateCsr(&mat->matDescr, 2106 csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, 2107 csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), 2108 csrmat->values->data().get(), 2109 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2110 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 2111 } 2112 stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one, 2113 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2114 mmdata->matCDescr,cusparse_scalartype, 2115 cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat); 2116 if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) { 2117 cudaError_t cerr; 2118 cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); 2119 cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr); 2120 mmdata->mmBufferSize = mmBufferSize; 2121 } 2122 mmdata->initialized = PETSC_TRUE; 2123 } else { 2124 /* to be safe, always update pointers of the mats */ 2125 stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat); 2126 stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat); 2127 stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat); 2128 } 2129 2130 /* do cusparseSpMM, which supports transpose on B */ 2131 stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one, 2132 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2133 mmdata->matCDescr,cusparse_scalartype, 2134 cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2135 #else 2136 PetscInt k; 2137 /* cusparseXcsrmm does not support 
transpose on B */ 2138 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2139 cublasHandle_t cublasv2handle; 2140 cublasStatus_t cerr; 2141 2142 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 2143 cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T, 2144 B->cmap->n,B->rmap->n, 2145 &PETSC_CUSPARSE_ONE ,barray,blda, 2146 &PETSC_CUSPARSE_ZERO,barray,blda, 2147 mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr); 2148 blda = B->cmap->n; 2149 k = B->cmap->n; 2150 } else { 2151 k = B->rmap->n; 2152 } 2153 2154 /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 2155 stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k, 2156 csrmat->num_entries,mat->alpha_one,mat->descr, 2157 csrmat->values->data().get(), 2158 csrmat->row_offsets->data().get(), 2159 csrmat->column_indices->data().get(), 2160 mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero, 2161 carray,clda);CHKERRCUSPARSE(stat); 2162 #endif 2163 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2164 ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr); 2165 ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr); 2166 if (product->type == MATPRODUCT_RARt) { 2167 ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2168 ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2169 } else if (product->type == MATPRODUCT_PtAP) { 2170 ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2171 ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2172 } else { 2173 ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr); 2174 } 2175 if (mmdata->cisdense) { 2176 ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr); 2177 } 2178 if (!biscuda) { 2179 ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 2180 } 2181 PetscFunctionReturn(0); 2182 } 2183 2184 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2185 { 2186 Mat_Product *product = C->product; 2187 Mat A,B; 2188 PetscInt m,n; 2189 PetscBool cisdense,flg; 2190 PetscErrorCode ierr; 2191 MatMatCusparse *mmdata; 2192 Mat_SeqAIJCUSPARSE *cusp; 2193 2194 PetscFunctionBegin; 2195 MatCheckProduct(C,1); 2196 if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty"); 2197 A = product->A; 2198 B = product->B; 2199 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2200 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2201 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2202 if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2203 switch (product->type) { 2204 case MATPRODUCT_AB: 2205 m = A->rmap->n; 2206 n = B->cmap->n; 2207 break; 2208 case MATPRODUCT_AtB: 2209 m = A->cmap->n; 2210 n = B->cmap->n; 2211 break; 2212 case MATPRODUCT_ABt: 2213 m = A->rmap->n; 2214 n = B->rmap->n; 2215 break; 2216 case MATPRODUCT_PtAP: 2217 m = B->cmap->n; 2218 n = B->cmap->n; 2219 break; 2220 case MATPRODUCT_RARt: 2221 m = B->rmap->n; 2222 n = B->rmap->n; 2223 break; 2224 default: 2225 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2226 } 2227 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2228 /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU 
*/ 2229 ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr); 2230 ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr); 2231 2232 /* product data */ 2233 ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2234 mmdata->cisdense = cisdense; 2235 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0) 2236 /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */ 2237 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2238 cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr); 2239 } 2240 #endif 2241 /* for these products we need intermediate storage */ 2242 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2243 ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr); 2244 ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr); 2245 if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */ 2246 ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr); 2247 } else { 2248 ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr); 2249 } 2250 } 2251 C->product->data = mmdata; 2252 C->product->destroy = MatDestroy_MatMatCusparse; 2253 2254 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 2255 PetscFunctionReturn(0); 2256 } 2257 2258 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2259 { 2260 Mat_Product *product = C->product; 2261 Mat A,B; 2262 Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2263 Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 2264 Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2265 CsrMatrix *Acsr,*Bcsr,*Ccsr; 2266 PetscBool flg; 2267 PetscErrorCode ierr; 2268 cusparseStatus_t stat; 2269 cudaError_t cerr; 2270 MatProductType ptype; 2271 MatMatCusparse *mmdata; 2272 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2273 cusparseSpMatDescr_t BmatSpDescr; 2274 #endif 2275 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 2276 2277 PetscFunctionBegin; 2278 MatCheckProduct(C,1); 2279 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty"); 2280 ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2281 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name); 2282 mmdata = (MatMatCusparse*)C->product->data; 2283 A = product->A; 2284 B = product->B; 2285 if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */ 2286 mmdata->reusesym = PETSC_FALSE; 2287 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2288 if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2289 Cmat = Ccusp->mat; 2290 if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]); 2291 Ccsr = (CsrMatrix*)Cmat->mat; 2292 if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct"); 2293 goto finalize; 2294 } 2295 if (!c->nz) goto finalize; 2296 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2297 if (!flg) 
SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2298 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2299 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name); 2300 if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2301 if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2302 Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2303 Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2304 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2305 if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2306 if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2307 if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2308 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2309 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 2310 2311 ptype = product->type; 2312 if (A->symmetric && ptype == MATPRODUCT_AtB) { 2313 ptype = MATPRODUCT_AB; 2314 if (!product->symbolic_used_the_fact_A_is_symmetric) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric"); 2315 } 2316 if (B->symmetric && ptype == MATPRODUCT_ABt) { 2317 ptype = MATPRODUCT_AB; 2318 if (!product->symbolic_used_the_fact_B_is_symmetric) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric"); 2319 } 2320 switch (ptype) { 2321 case MATPRODUCT_AB: 2322 Amat = Acusp->mat; 2323 Bmat = Bcusp->mat; 2324 break; 2325 case MATPRODUCT_AtB: 2326 Amat = Acusp->matTranspose; 2327 Bmat = Bcusp->mat; 2328 break; 2329 case MATPRODUCT_ABt: 2330 Amat = Acusp->mat; 2331 Bmat = Bcusp->matTranspose; 2332 break; 2333 default: 2334 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2335 } 2336 Cmat = Ccusp->mat; 2337 if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2338 if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2339 if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]); 2340 Acsr = (CsrMatrix*)Amat->mat; 2341 Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */ 2342 Ccsr = (CsrMatrix*)Cmat->mat; 2343 if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct"); 2344 if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct"); 2345 if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct"); 2346 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2347 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2348 BmatSpDescr = mmdata->Bcsr ? 
mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */ 2349 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2350 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2351 stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, 2352 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2353 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2354 mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2355 #else 2356 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2357 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2358 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2359 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2360 stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, 2361 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2362 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2363 #endif 2364 #else 2365 stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, 2366 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2367 Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2368 Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2369 Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2370 #endif 2371 ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2372 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2373 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2374 C->offloadmask = PETSC_OFFLOAD_GPU; 2375 finalize: 2376 /* shorter version of MatAssemblyEnd_SeqAIJ */ 2377 ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 2378 ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 2379 ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr); 2380 c->reallocs = 0; 2381 C->info.mallocs += 0; 2382 C->info.nz_unneeded = 0; 2383 C->assembled = C->was_assembled = PETSC_TRUE; 2384 C->num_ass++; 2385 PetscFunctionReturn(0); 2386 } 2387 2388 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2389 { 2390 Mat_Product *product = C->product; 2391 Mat A,B; 2392 Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2393 Mat_SeqAIJ *a,*b,*c; 2394 Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2395 CsrMatrix *Acsr,*Bcsr,*Ccsr; 2396 PetscInt i,j,m,n,k; 2397 PetscBool flg; 2398 PetscErrorCode ierr; 2399 cusparseStatus_t stat; 2400 cudaError_t cerr; 2401 MatProductType ptype; 2402 MatMatCusparse *mmdata; 2403 PetscLogDouble flops; 2404 PetscBool biscompressed,ciscompressed; 2405 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2406 int64_t C_num_rows1, C_num_cols1, C_nnz1; 2407 cusparseSpMatDescr_t BmatSpDescr; 2408 #else 2409 int cnz; 2410 #endif 2411 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 2412 2413 PetscFunctionBegin; 2414 MatCheckProduct(C,1); 2415 if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty"); 2416 A = product->A; 2417 B = product->B; 2418 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2419 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for 
type %s",((PetscObject)A)->type_name); 2420 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2421 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name); 2422 a = (Mat_SeqAIJ*)A->data; 2423 b = (Mat_SeqAIJ*)B->data; 2424 /* product data */ 2425 ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2426 C->product->data = mmdata; 2427 C->product->destroy = MatDestroy_MatMatCusparse; 2428 2429 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2430 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 2431 Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */ 2432 Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2433 if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2434 if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2435 2436 ptype = product->type; 2437 if (A->symmetric && ptype == MATPRODUCT_AtB) { 2438 ptype = MATPRODUCT_AB; 2439 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 2440 } 2441 if (B->symmetric && ptype == MATPRODUCT_ABt) { 2442 ptype = MATPRODUCT_AB; 2443 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 2444 } 2445 biscompressed = PETSC_FALSE; 2446 ciscompressed = PETSC_FALSE; 2447 switch (ptype) { 2448 case MATPRODUCT_AB: 2449 m = A->rmap->n; 2450 n = B->cmap->n; 2451 k = A->cmap->n; 2452 Amat = Acusp->mat; 2453 Bmat = Bcusp->mat; 2454 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2455 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2456 break; 2457 case MATPRODUCT_AtB: 2458 m = A->cmap->n; 2459 n = B->cmap->n; 2460 k = A->rmap->n; 2461 ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr); 2462 Amat = Acusp->matTranspose; 2463 Bmat = Bcusp->mat; 2464 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2465 break; 2466 case MATPRODUCT_ABt: 2467 m = A->rmap->n; 2468 n = B->rmap->n; 2469 k = A->cmap->n; 2470 ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr); 2471 Amat = Acusp->mat; 2472 Bmat = Bcusp->matTranspose; 2473 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2474 break; 2475 default: 2476 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2477 } 2478 2479 /* create cusparse matrix */ 2480 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2481 ierr = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 2482 c = (Mat_SeqAIJ*)C->data; 2483 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2484 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 2485 Ccsr = new CsrMatrix; 2486 2487 c->compressedrow.use = ciscompressed; 2488 if (c->compressedrow.use) { /* if a is in compressed row, then c will be in compressed row format */ 2489 c->compressedrow.nrows = a->compressedrow.nrows; 2490 ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr); 2491 ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr); 2492 Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 2493 Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 2494 Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows); 2495 } else { 2496 c->compressedrow.nrows = 0; 2497 c->compressedrow.i = NULL; 2498 c->compressedrow.rindex = NULL; 2499 Ccusp->workVector =
NULL; 2500 Cmat->cprowIndices = NULL; 2501 } 2502 Ccusp->nrows = ciscompressed ? c->compressedrow.nrows : m; 2503 Ccusp->mat = Cmat; 2504 Ccusp->mat->mat = Ccsr; 2505 Ccsr->num_rows = Ccusp->nrows; 2506 Ccsr->num_cols = n; 2507 Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1); 2508 stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat); 2509 stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 2510 stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 2511 cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 2512 cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 2513 cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 2514 cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2515 cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2516 cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2517 if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raises errors in different calls when matrices have zero rows/columns! */ 2518 thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0); 2519 c->nz = 0; 2520 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2521 Ccsr->values = new THRUSTARRAY(c->nz); 2522 goto finalizesym; 2523 } 2524 2525 if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2526 if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2527 Acsr = (CsrMatrix*)Amat->mat; 2528 if (!biscompressed) { 2529 Bcsr = (CsrMatrix*)Bmat->mat; 2530 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2531 BmatSpDescr = Bmat->matDescr; 2532 #endif 2533 } else { /* we need to use row offsets for the full matrix */ 2534 CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat; 2535 Bcsr = new CsrMatrix; 2536 Bcsr->num_rows = B->rmap->n; 2537 Bcsr->num_cols = cBcsr->num_cols; 2538 Bcsr->num_entries = cBcsr->num_entries; 2539 Bcsr->column_indices = cBcsr->column_indices; 2540 Bcsr->values = cBcsr->values; 2541 if (!Bcusp->rowoffsets_gpu) { 2542 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 2543 Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 2544 ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 2545 } 2546 Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 2547 mmdata->Bcsr = Bcsr; 2548 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2549 if (Bcsr->num_rows && Bcsr->num_cols) { 2550 stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, 2551 Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2552 Bcsr->values->data().get(), 2553 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2554 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2555 } 2556 BmatSpDescr = mmdata->matSpBDescr; 2557 #endif 2558 } 2559 if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct"); 2560 if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct"); 2561 /* precompute flops count */ 2562 if (ptype == MATPRODUCT_AB) { 2563 for (i=0, flops = 0; i<A->rmap->n; i++) { 2564 const PetscInt st = a->i[i]; 2565 const
PetscInt en = a->i[i+1]; 2566 for (j=st; j<en; j++) { 2567 const PetscInt brow = a->j[j]; 2568 flops += 2.*(b->i[brow+1] - b->i[brow]); 2569 } 2570 } 2571 } else if (ptype == MATPRODUCT_AtB) { 2572 for (i=0, flops = 0; i<A->rmap->n; i++) { 2573 const PetscInt anzi = a->i[i+1] - a->i[i]; 2574 const PetscInt bnzi = b->i[i+1] - b->i[i]; 2575 flops += (2.*anzi)*bnzi; 2576 } 2577 } else { /* TODO */ 2578 flops = 0.; 2579 } 2580 2581 mmdata->flops = flops; 2582 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2583 2584 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2585 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2586 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, 2587 NULL, NULL, NULL, 2588 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2589 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2590 stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2591 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2592 { 2593 /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it. 2594 We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse 2595 */ 2596 void* dBuffer1 = NULL; 2597 void* dBuffer2 = NULL; 2598 void* dBuffer3 = NULL; 2599 /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */ 2600 size_t bufferSize1 = 0; 2601 size_t bufferSize2 = 0; 2602 size_t bufferSize3 = 0; 2603 size_t bufferSize4 = 0; 2604 size_t bufferSize5 = 0; 2605 2606 /*----------------------------------------------------------------------*/ 2607 /* ask bufferSize1 bytes for external memory */ 2608 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2609 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2610 &bufferSize1, NULL);CHKERRCUSPARSE(stat); 2611 cerr = cudaMalloc((void**) &dBuffer1, bufferSize1);CHKERRCUDA(cerr); 2612 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2613 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2614 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2615 &bufferSize1, dBuffer1);CHKERRCUSPARSE(stat); 2616 2617 /*----------------------------------------------------------------------*/ 2618 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2619 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2620 &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);CHKERRCUSPARSE(stat); 2621 cerr = cudaMalloc((void**) &dBuffer2, bufferSize2);CHKERRCUDA(cerr); 2622 cerr = cudaMalloc((void**) &dBuffer3, bufferSize3);CHKERRCUDA(cerr); 2623 cerr = cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4);CHKERRCUDA(cerr); 2624 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2625 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2626 &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);CHKERRCUSPARSE(stat); 2627 cerr = cudaFree(dBuffer1);CHKERRCUDA(cerr); 2628 cerr = cudaFree(dBuffer2);CHKERRCUDA(cerr); 2629 2630 /*----------------------------------------------------------------------*/ 2631 /* get matrix C non-zero entries C_nnz1 */ 2632 stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat); 2633 c->nz = (PetscInt) C_nnz1; 2634 /* allocate matrix C */ 2635 Ccsr->column_indices = 
new THRUSTINTARRAY32(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2636 Ccsr->values = new THRUSTARRAY(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2637 /* update matC with the new pointers */ 2638 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2639 Ccsr->values->data().get());CHKERRCUSPARSE(stat); 2640 2641 /*----------------------------------------------------------------------*/ 2642 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2643 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2644 &bufferSize5, NULL);CHKERRCUSPARSE(stat); 2645 cerr = cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5);CHKERRCUDA(cerr); 2646 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2647 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2648 &bufferSize5, mmdata->dBuffer5);CHKERRCUSPARSE(stat); 2649 cerr = cudaFree(dBuffer3);CHKERRCUDA(cerr); 2650 stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, 2651 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2652 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2653 mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2654 ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024);CHKERRQ(ierr); 2655 } 2656 #else // ~PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2657 size_t bufSize2; 2658 /* ask bufferSize bytes for external memory */ 2659 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 2660 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2661 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2662 mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat); 2663 cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr); 2664 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2665 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 2666 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2667 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2668 mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat); 2669 /* ask bufferSize again bytes for external memory */ 2670 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2671 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2672 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2673 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat); 2674 /* The CUSPARSE documentation is not clear, nor the API 2675 We need both buffers to perform the operations properly! 2676 mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 2677 it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 2678 is stored in the descriptor! What a messy API... 
*/ 2679 cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr); 2680 /* compute the intermediate product of A * B */ 2681 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2682 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2683 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2684 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2685 /* get matrix C non-zero entries C_nnz1 */ 2686 stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat); 2687 c->nz = (PetscInt) C_nnz1; 2688 ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr); 2689 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2690 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2691 Ccsr->values = new THRUSTARRAY(c->nz); 2692 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2693 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2694 Ccsr->values->data().get());CHKERRCUSPARSE(stat); 2695 stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, 2696 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2697 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2698 #endif 2699 #else 2700 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 2701 stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, 2702 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2703 Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2704 Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2705 Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat); 2706 c->nz = cnz; 2707 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2708 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2709 Ccsr->values = new THRUSTARRAY(c->nz); 2710 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2711 2712 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2713 /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 2714 I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 2715 D is NULL, despite the fact that CUSPARSE documentation claims it is supported! 
*/ 2716 stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, 2717 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2718 Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2719 Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2720 Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2721 #endif 2722 ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2723 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2724 finalizesym: 2725 c->singlemalloc = PETSC_FALSE; 2726 c->free_a = PETSC_TRUE; 2727 c->free_ij = PETSC_TRUE; 2728 ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 2729 ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 2730 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 2731 PetscInt *d_i = c->i; 2732 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 2733 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 2734 ii = *Ccsr->row_offsets; 2735 jj = *Ccsr->column_indices; 2736 if (ciscompressed) d_i = c->compressedrow.i; 2737 cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2738 cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2739 } else { 2740 PetscInt *d_i = c->i; 2741 if (ciscompressed) d_i = c->compressedrow.i; 2742 cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2743 cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2744 } 2745 if (ciscompressed) { /* need to expand host row offsets */ 2746 PetscInt r = 0; 2747 c->i[0] = 0; 2748 for (k = 0; k < c->compressedrow.nrows; k++) { 2749 const PetscInt next = c->compressedrow.rindex[k]; 2750 const PetscInt old = c->compressedrow.i[k]; 2751 for (; r < next; r++) c->i[r+1] = old; 2752 } 2753 for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows]; 2754 } 2755 ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 2756 ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 2757 ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 2758 c->maxnz = c->nz; 2759 c->nonzerorowcnt = 0; 2760 c->rmax = 0; 2761 for (k = 0; k < m; k++) { 2762 const PetscInt nn = c->i[k+1] - c->i[k]; 2763 c->ilen[k] = c->imax[k] = nn; 2764 c->nonzerorowcnt += (PetscInt)!!nn; 2765 c->rmax = PetscMax(c->rmax,nn); 2766 } 2767 ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr); 2768 ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 2769 Ccsr->num_entries = c->nz; 2770 2771 C->nonzerostate++; 2772 ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr); 2773 ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr); 2774 Ccusp->nonzerostate = C->nonzerostate; 2775 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 2776 C->preallocated = PETSC_TRUE; 2777 C->assembled = PETSC_FALSE; 2778 C->was_assembled = PETSC_FALSE; 2779 if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */ 2780 mmdata->reusesym = PETSC_TRUE; 2781 C->offloadmask = PETSC_OFFLOAD_GPU; 2782 } 2783 C->ops->productnumeric = 
MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2784 PetscFunctionReturn(0); 2785 } 2786 2787 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 2788 2789 /* handles sparse or dense B */ 2790 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat) 2791 { 2792 Mat_Product *product = mat->product; 2793 PetscErrorCode ierr; 2794 PetscBool isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE; 2795 2796 PetscFunctionBegin; 2797 MatCheckProduct(mat,1); 2798 ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr); 2799 if (!product->A->boundtocpu && !product->B->boundtocpu) { 2800 ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr); 2801 } 2802 if (product->type == MATPRODUCT_ABC) { 2803 Ciscusp = PETSC_FALSE; 2804 if (!product->C->boundtocpu) { 2805 ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr); 2806 } 2807 } 2808 if (Biscusp && Ciscusp) { /* we can always select the CPU backend */ 2809 PetscBool usecpu = PETSC_FALSE; 2810 switch (product->type) { 2811 case MATPRODUCT_AB: 2812 if (product->api_user) { 2813 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");CHKERRQ(ierr); 2814 ierr = PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2815 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2816 } else { 2817 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr); 2818 ierr = PetscOptionsBool("-matproduct_ab_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2819 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2820 } 2821 break; 2822 case MATPRODUCT_AtB: 2823 if (product->api_user) { 2824 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr); 2825 ierr = PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2826 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2827 } else { 2828 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr); 2829 ierr = PetscOptionsBool("-matproduct_atb_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2830 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2831 } 2832 break; 2833 case MATPRODUCT_PtAP: 2834 if (product->api_user) { 2835 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");CHKERRQ(ierr); 2836 ierr = PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2837 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2838 } else { 2839 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr); 2840 ierr = PetscOptionsBool("-matproduct_ptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2841 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2842 } 2843 break; 2844 case MATPRODUCT_RARt: 2845 if (product->api_user) { 2846 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");CHKERRQ(ierr); 2847 ierr = PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2848 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2849 } else { 2850 
ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");CHKERRQ(ierr); 2851 ierr = PetscOptionsBool("-matproduct_rart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2852 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2853 } 2854 break; 2855 case MATPRODUCT_ABC: 2856 if (product->api_user) { 2857 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");CHKERRQ(ierr); 2858 ierr = PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2859 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2860 } else { 2861 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");CHKERRQ(ierr); 2862 ierr = PetscOptionsBool("-matproduct_abc_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2863 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2864 } 2865 break; 2866 default: 2867 break; 2868 } 2869 if (usecpu) Biscusp = Ciscusp = PETSC_FALSE; 2870 } 2871 /* dispatch */ 2872 if (isdense) { 2873 switch (product->type) { 2874 case MATPRODUCT_AB: 2875 case MATPRODUCT_AtB: 2876 case MATPRODUCT_ABt: 2877 case MATPRODUCT_PtAP: 2878 case MATPRODUCT_RARt: 2879 if (product->A->boundtocpu) { 2880 ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr); 2881 } else { 2882 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 2883 } 2884 break; 2885 case MATPRODUCT_ABC: 2886 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2887 break; 2888 default: 2889 break; 2890 } 2891 } else if (Biscusp && Ciscusp) { 2892 switch (product->type) { 2893 case MATPRODUCT_AB: 2894 case MATPRODUCT_AtB: 2895 case MATPRODUCT_ABt: 2896 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2897 break; 2898 case MATPRODUCT_PtAP: 2899 case MATPRODUCT_RARt: 2900 case MATPRODUCT_ABC: 2901 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2902 break; 2903 default: 2904 break; 2905 } 2906 } else { /* fallback for AIJ */ 2907 ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 2908 } 2909 PetscFunctionReturn(0); 2910 } 2911 2912 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2913 { 2914 PetscErrorCode ierr; 2915 2916 PetscFunctionBegin; 2917 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2918 PetscFunctionReturn(0); 2919 } 2920 2921 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz) 2922 { 2923 PetscErrorCode ierr; 2924 2925 PetscFunctionBegin; 2926 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2927 PetscFunctionReturn(0); 2928 } 2929 2930 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2931 { 2932 PetscErrorCode ierr; 2933 2934 PetscFunctionBegin; 2935 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 2936 PetscFunctionReturn(0); 2937 } 2938 2939 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2940 { 2941 PetscErrorCode ierr; 2942 2943 PetscFunctionBegin; 2944 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 2945 PetscFunctionReturn(0); 2946 } 2947 2948 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2949 { 2950 PetscErrorCode ierr; 2951 2952 PetscFunctionBegin; 2953 ierr = 
MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2954 PetscFunctionReturn(0); 2955 } 2956 2957 __global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y) 2958 { 2959 int i = blockIdx.x*blockDim.x + threadIdx.x; 2960 if (i < n) y[idx[i]] += x[i]; 2961 } 2962 2963 /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 2964 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm) 2965 { 2966 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2967 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 2968 Mat_SeqAIJCUSPARSEMultStruct *matstruct; 2969 PetscScalar *xarray,*zarray,*dptr,*beta,*xptr; 2970 PetscErrorCode ierr; 2971 cusparseStatus_t stat; 2972 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2973 PetscBool compressed; 2974 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2975 PetscInt nx,ny; 2976 #endif 2977 2978 PetscFunctionBegin; 2979 if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported"); 2980 if (!a->nonzerorowcnt) { 2981 if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);} 2982 else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);} 2983 PetscFunctionReturn(0); 2984 } 2985 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 2986 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2987 if (!trans) { 2988 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 2989 if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 2990 } else { 2991 if (herm || !A->form_explicit_transpose) { 2992 opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 2993 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 2994 } else { 2995 if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);} 2996 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 2997 } 2998 } 2999 /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 3000 compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE; 3001 3002 try { 3003 ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 3004 if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */ 3005 else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */ 3006 3007 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3008 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3009 /* z = A x + beta y. 3010 If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 3011 When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 3012 */ 3013 xptr = xarray; 3014 dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 3015 beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 3016 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3017 /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 3018 allocated to accommodate different uses. So we get the length info directly from mat. 
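       For example, when A stores compressed rows, the CSR held in mat keeps only the nonzero rows, so in the
       non-transpose case ny = mat->num_rows can be smaller than A->rmap->n even though the result is later
       scattered into the full-length z.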
3019 */ 3020 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3021 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 3022 nx = mat->num_cols; 3023 ny = mat->num_rows; 3024 } 3025 #endif 3026 } else { 3027 /* z = A^T x + beta y 3028 If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 3029 Note A^Tx is of full length, so we set beta to 1.0 if y exists. 3030 */ 3031 xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 3032 dptr = zarray; 3033 beta = yy ? matstruct->beta_one : matstruct->beta_zero; 3034 if (compressed) { /* Scatter x to work vector */ 3035 thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 3036 thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 3037 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 3038 VecCUDAEqualsReverse()); 3039 } 3040 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3041 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3042 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 3043 nx = mat->num_rows; 3044 ny = mat->num_cols; 3045 } 3046 #endif 3047 } 3048 3049 /* csr_spmv does y = alpha op(A) x + beta y */ 3050 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3051 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3052 if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 3053 if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 3054 cudaError_t cerr; 3055 stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 3056 stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 3057 stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, 3058 matstruct->matDescr, 3059 matstruct->cuSpMV[opA].vecXDescr, beta, 3060 matstruct->cuSpMV[opA].vecYDescr, 3061 cusparse_scalartype, 3062 cusparsestruct->spmvAlg, 3063 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat); 3064 cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr); 3065 3066 matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 3067 } else { 3068 /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 3069 stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat); 3070 stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat); 3071 } 3072 3073 stat = cusparseSpMV(cusparsestruct->handle, opA, 3074 matstruct->alpha_one, 3075 matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */ 3076 matstruct->cuSpMV[opA].vecXDescr, 3077 beta, 3078 matstruct->cuSpMV[opA].vecYDescr, 3079 cusparse_scalartype, 3080 cusparsestruct->spmvAlg, 3081 matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat); 3082 #else 3083 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 3084 stat = cusparse_csr_spmv(cusparsestruct->handle, opA, 3085 mat->num_rows, mat->num_cols, 3086 mat->num_entries, matstruct->alpha_one, matstruct->descr, 3087 mat->values->data().get(), mat->row_offsets->data().get(), 3088 
mat->column_indices->data().get(), xptr, beta, 3089 dptr);CHKERRCUSPARSE(stat); 3090 #endif 3091 } else { 3092 if (cusparsestruct->nrows) { 3093 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3094 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3095 #else 3096 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 3097 stat = cusparse_hyb_spmv(cusparsestruct->handle, opA, 3098 matstruct->alpha_one, matstruct->descr, hybMat, 3099 xptr, beta, 3100 dptr);CHKERRCUSPARSE(stat); 3101 #endif 3102 } 3103 } 3104 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3105 3106 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3107 if (yy) { /* MatMultAdd: zz = A*xx + yy */ 3108 if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 3109 ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */ 3110 } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */ 3111 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 3112 } 3113 } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 3114 ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr); 3115 } 3116 3117 /* ScatterAdd the result from work vector into the full vector when A is compressed */ 3118 if (compressed) { 3119 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3120 /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred) 3121 and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to 3122 prevent that. So I just add a ScatterAdd kernel. 3123 */ 3124 #if 0 3125 thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 3126 thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream), 3127 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 3128 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 3129 VecCUDAPlusEquals()); 3130 #else 3131 PetscInt n = matstruct->cprowIndices->size(); 3132 ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray); 3133 #endif 3134 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3135 } 3136 } else { 3137 if (yy && yy != zz) { 3138 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 3139 } 3140 } 3141 ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 3142 if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);} 3143 else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);} 3144 } catch(char *ex) { 3145 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 3146 } 3147 if (yy) { 3148 ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); 3149 } else { 3150 ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr); 3151 } 3152 PetscFunctionReturn(0); 3153 } 3154 3155 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 3156 { 3157 PetscErrorCode ierr; 3158 3159 PetscFunctionBegin; 3160 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 3161 PetscFunctionReturn(0); 3162 } 3163 3164 static 
PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 3165 { 3166 PetscErrorCode ierr; 3167 PetscObjectState onnz = A->nonzerostate; 3168 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3169 3170 PetscFunctionBegin; 3171 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 3172 if (onnz != A->nonzerostate && cusp->deviceMat) { 3173 cudaError_t cerr; 3174 3175 ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr); 3176 cerr = cudaFree(cusp->deviceMat);CHKERRCUDA(cerr); 3177 cusp->deviceMat = NULL; 3178 } 3179 PetscFunctionReturn(0); 3180 } 3181 3182 /* --------------------------------------------------------------------------------*/ 3183 /*@ 3184 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 3185 (the default parallel PETSc format). This matrix will ultimately be pushed down 3186 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 3187 assembly performance the user should preallocate the matrix storage by setting 3188 the parameter nz (or the array nnz). By setting these parameters accurately, 3189 performance during matrix assembly can be increased by more than a factor of 50. 3190 3191 Collective 3192 3193 Input Parameters: 3194 + comm - MPI communicator, set to PETSC_COMM_SELF 3195 . m - number of rows 3196 . n - number of columns 3197 . nz - number of nonzeros per row (same for all rows) 3198 - nnz - array containing the number of nonzeros in the various rows 3199 (possibly different for each row) or NULL 3200 3201 Output Parameter: 3202 . A - the matrix 3203 3204 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 3205 MatXXXXSetPreallocation() paradigm instead of this routine directly. 3206 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 3207 3208 Notes: 3209 If nnz is given then nz is ignored. 3210 3211 The AIJ format (also called the Yale sparse matrix format or 3212 compressed row storage) is fully compatible with standard Fortran 77 3213 storage. That is, the stored row and column indices can begin at 3214 either one (as in Fortran) or zero. See the users' manual for details. 3215 3216 Specify the preallocated storage with either nz or nnz (not both). 3217 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 3218 allocation. For large problems you MUST preallocate memory or you 3219 will get TERRIBLE performance; see the users' manual chapter on matrices. 3220 3221 By default, this format uses inodes (identical nodes) when possible, to 3222 improve numerical efficiency of matrix-vector products and solves. We 3223 search for consecutive rows with the same nonzero structure, thereby 3224 reusing matrix information to achieve increased efficiency.
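   Example (a minimal sketch of the recommended MatCreate()/MatSetType()/preallocation path; error checking with CHKERRQ() omitted for brevity):
.vb
   Mat A;
   MatCreate(PETSC_COMM_SELF,&A);
   MatSetSizes(A,m,n,m,n);
   MatSetType(A,MATSEQAIJCUSPARSE);
   MatSeqAIJSetPreallocation(A,nz,NULL);   /* or pass an nnz[] array instead of nz */
   /* set entries with MatSetValues(), then MatAssemblyBegin()/MatAssemblyEnd(); MatMult() etc. then run on the GPU */
.ve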
3225 3226 Level: intermediate 3227 3228 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 3229 @*/ 3230 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 3231 { 3232 PetscErrorCode ierr; 3233 3234 PetscFunctionBegin; 3235 ierr = MatCreate(comm,A);CHKERRQ(ierr); 3236 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 3237 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3238 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 3239 PetscFunctionReturn(0); 3240 } 3241 3242 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 3243 { 3244 PetscErrorCode ierr; 3245 3246 PetscFunctionBegin; 3247 if (A->factortype == MAT_FACTOR_NONE) { 3248 ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr); 3249 } else { 3250 ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr); 3251 } 3252 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr); 3253 ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr); 3254 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 3255 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 3256 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr); 3257 ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 3258 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 3259 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 3260 ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaijcusparse_hypre_C",NULL);CHKERRQ(ierr); 3261 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 3262 PetscFunctionReturn(0); 3263 } 3264 3265 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*); 3266 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool); 3267 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B) 3268 { 3269 PetscErrorCode ierr; 3270 3271 PetscFunctionBegin; 3272 ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 3273 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr); 3274 PetscFunctionReturn(0); 3275 } 3276 3277 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str) 3278 { 3279 PetscErrorCode ierr; 3280 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 3281 Mat_SeqAIJCUSPARSE *cy; 3282 Mat_SeqAIJCUSPARSE *cx; 3283 PetscScalar *ay; 3284 const PetscScalar *ax; 3285 CsrMatrix *csry,*csrx; 3286 3287 PetscFunctionBegin; 3288 cy = (Mat_SeqAIJCUSPARSE*)Y->spptr; 3289 cx = (Mat_SeqAIJCUSPARSE*)X->spptr; 3290 if (X->ops->axpy != Y->ops->axpy) { 3291 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr); 3292 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 3293 PetscFunctionReturn(0); 3294 } 3295 /* if we are here, it means both matrices are bound to GPU */ 3296 ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr); 3297 ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr); 3298 if (cy->format != MAT_CUSPARSE_CSR) 
SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported"); 3299 if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported"); 3300 csry = (CsrMatrix*)cy->mat->mat; 3301 csrx = (CsrMatrix*)cx->mat->mat; 3302 /* see if we can turn this into a cublas axpy */ 3303 if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) { 3304 bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin()); 3305 if (eq) { 3306 eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin()); 3307 } 3308 if (eq) str = SAME_NONZERO_PATTERN; 3309 } 3310 /* spgeam is buggy with one column */ 3311 if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN; 3312 3313 if (str == SUBSET_NONZERO_PATTERN) { 3314 cusparseStatus_t stat; 3315 PetscScalar b = 1.0; 3316 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3317 size_t bufferSize; 3318 void *buffer; 3319 cudaError_t cerr; 3320 #endif 3321 3322 ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr); 3323 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3324 stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 3325 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3326 stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n, 3327 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3328 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3329 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat); 3330 cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr); 3331 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3332 stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3333 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3334 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3335 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat); 3336 ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr); 3337 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3338 cerr = cudaFree(buffer);CHKERRCUDA(cerr); 3339 #else 3340 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3341 stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3342 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3343 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3344 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat); 3345 ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr); 3346 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3347 #endif 3348 stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 3349 ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr); 3350 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3351 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3352 } else if (str == SAME_NONZERO_PATTERN) { 3353 cublasHandle_t cublasv2handle; 3354 cublasStatus_t berr; 3355 PetscBLASInt one = 1, bnz = 1; 3356 3357 ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr); 3358 ierr = 
MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3359 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 3360 ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr); 3361 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3362 berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr); 3363 ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr); 3364 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3365 ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr); 3366 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3367 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3368 } else { 3369 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr); 3370 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 3371 } 3372 PetscFunctionReturn(0); 3373 } 3374 3375 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a) 3376 { 3377 PetscErrorCode ierr; 3378 Mat_SeqAIJ *y = (Mat_SeqAIJ*)Y->data; 3379 PetscScalar *ay; 3380 cublasHandle_t cublasv2handle; 3381 cublasStatus_t berr; 3382 PetscBLASInt one = 1, bnz = 1; 3383 3384 PetscFunctionBegin; 3385 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3386 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 3387 ierr = PetscBLASIntCast(y->nz,&bnz);CHKERRQ(ierr); 3388 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3389 berr = cublasXscal(cublasv2handle,bnz,&a,ay,one);CHKERRCUBLAS(berr); 3390 ierr = PetscLogGpuFlops(bnz);CHKERRQ(ierr); 3391 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3392 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3393 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3394 PetscFunctionReturn(0); 3395 } 3396 3397 static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) 3398 { 3399 PetscErrorCode ierr; 3400 PetscBool both = PETSC_FALSE; 3401 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3402 3403 PetscFunctionBegin; 3404 if (A->factortype == MAT_FACTOR_NONE) { 3405 Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr; 3406 if (spptr->mat) { 3407 CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat; 3408 if (matrix->values) { 3409 both = PETSC_TRUE; 3410 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3411 } 3412 } 3413 if (spptr->matTranspose) { 3414 CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat; 3415 if (matrix->values) { 3416 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3417 } 3418 } 3419 } 3420 //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 3421 ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr); 3422 ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 3423 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 3424 else A->offloadmask = PETSC_OFFLOAD_CPU; 3425 PetscFunctionReturn(0); 3426 } 3427 3428 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg) 3429 { 3430 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3431 PetscErrorCode ierr; 3432 3433 PetscFunctionBegin; 3434 if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0); 3435 if (flg) { 3436 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 3437 3438 A->ops->scale = MatScale_SeqAIJ; 3439 A->ops->axpy = MatAXPY_SeqAIJ; 3440 A->ops->zeroentries = MatZeroEntries_SeqAIJ; 3441 A->ops->mult = MatMult_SeqAIJ; 3442 A->ops->multadd = MatMultAdd_SeqAIJ; 3443 A->ops->multtranspose = MatMultTranspose_SeqAIJ; 3444 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 3445 A->ops->multhermitiantranspose = NULL; 3446 A->ops->multhermitiantransposeadd = NULL; 3447 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ; 3448 ierr = 
PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr); 3449 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 3450 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 3451 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 3452 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 3453 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr); 3454 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr); 3455 } else { 3456 A->ops->scale = MatScale_SeqAIJCUSPARSE; 3457 A->ops->axpy = MatAXPY_SeqAIJCUSPARSE; 3458 A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE; 3459 A->ops->mult = MatMult_SeqAIJCUSPARSE; 3460 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 3461 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 3462 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 3463 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 3464 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 3465 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE; 3466 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr); 3467 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3468 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3469 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3470 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3471 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr); 3472 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3473 } 3474 A->boundtocpu = flg; 3475 a->inode.use = flg; 3476 PetscFunctionReturn(0); 3477 } 3478 3479 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 3480 { 3481 PetscErrorCode ierr; 3482 cusparseStatus_t stat; 3483 Mat B; 3484 3485 PetscFunctionBegin; 3486 ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */ 3487 if (reuse == MAT_INITIAL_MATRIX) { 3488 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 3489 } else if (reuse == MAT_REUSE_MATRIX) { 3490 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 3491 } 3492 B = *newmat; 3493 3494 ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 3495 ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr); 3496 3497 if (reuse != MAT_REUSE_MATRIX && !B->spptr) { 3498 if (B->factortype == MAT_FACTOR_NONE) { 3499 Mat_SeqAIJCUSPARSE *spptr; 3500 ierr = PetscNew(&spptr);CHKERRQ(ierr); 3501 stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3502 stat = 
cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat); 3503 spptr->format = MAT_CUSPARSE_CSR; 3504 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3505 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 3506 spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */ 3507 #else 3508 spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 3509 #endif 3510 spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 3511 spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 3512 #endif 3513 B->spptr = spptr; 3514 } else { 3515 Mat_SeqAIJCUSPARSETriFactors *spptr; 3516 3517 ierr = PetscNew(&spptr);CHKERRQ(ierr); 3518 stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3519 stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat); 3520 B->spptr = spptr; 3521 } 3522 B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 3523 } 3524 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 3525 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 3526 B->ops->setoption = MatSetOption_SeqAIJCUSPARSE; 3527 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 3528 B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 3529 B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 3530 3531 ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr); 3532 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3533 ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 3534 #if defined(PETSC_HAVE_HYPRE) 3535 ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr); 3536 #endif 3537 PetscFunctionReturn(0); 3538 } 3539 3540 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 3541 { 3542 PetscErrorCode ierr; 3543 3544 PetscFunctionBegin; 3545 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 3546 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 3547 PetscFunctionReturn(0); 3548 } 3549 3550 /*MC 3551 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 3552 3553 A matrix type whose data resides on Nvidia GPUs. These matrices can be stored in 3554 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 3555 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 3556 3557 Options Database Keys: 3558 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 3559 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3560 - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
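   For example, an existing AIJ matrix can be switched to this type and its multiply storage format selected programmatically (a sketch equivalent to using the options above with MatSetFromOptions(); error checking omitted):
.vb
   MatSetType(A,MATSEQAIJCUSPARSE);
   MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,MAT_CUSPARSE_CSR);
.ve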
3561 3562 Level: beginner 3563 3564 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 3565 M*/ 3566 3567 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*); 3568 3569 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 3570 { 3571 PetscErrorCode ierr; 3572 3573 PetscFunctionBegin; 3574 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr); 3575 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3576 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3577 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3578 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3579 3580 PetscFunctionReturn(0); 3581 } 3582 3583 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 3584 { 3585 PetscErrorCode ierr; 3586 cusparseStatus_t stat; 3587 3588 PetscFunctionBegin; 3589 if (*cusparsestruct) { 3590 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr); 3591 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr); 3592 delete (*cusparsestruct)->workVector; 3593 delete (*cusparsestruct)->rowoffsets_gpu; 3594 delete (*cusparsestruct)->cooPerm; 3595 delete (*cusparsestruct)->cooPerm_a; 3596 delete (*cusparsestruct)->csr2csc_i; 3597 if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);} 3598 ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr); 3599 } 3600 PetscFunctionReturn(0); 3601 } 3602 3603 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 3604 { 3605 PetscFunctionBegin; 3606 if (*mat) { 3607 delete (*mat)->values; 3608 delete (*mat)->column_indices; 3609 delete (*mat)->row_offsets; 3610 delete *mat; 3611 *mat = 0; 3612 } 3613 PetscFunctionReturn(0); 3614 } 3615 3616 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 3617 { 3618 cusparseStatus_t stat; 3619 PetscErrorCode ierr; 3620 3621 PetscFunctionBegin; 3622 if (*trifactor) { 3623 if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); } 3624 if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); } 3625 ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 3626 if ((*trifactor)->solveBuffer) {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);} 3627 if ((*trifactor)->AA_h) {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);} 3628 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3629 if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);} 3630 #endif 3631 ierr = PetscFree(*trifactor);CHKERRQ(ierr); 3632 } 3633 PetscFunctionReturn(0); 3634 } 3635 3636 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 3637 { 3638 CsrMatrix *mat; 3639 cusparseStatus_t stat; 3640 
cudaError_t err; 3641 3642 PetscFunctionBegin; 3643 if (*matstruct) { 3644 if ((*matstruct)->mat) { 3645 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 3646 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3647 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3648 #else 3649 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 3650 stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat); 3651 #endif 3652 } else { 3653 mat = (CsrMatrix*)(*matstruct)->mat; 3654 CsrMatrix_Destroy(&mat); 3655 } 3656 } 3657 if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); } 3658 delete (*matstruct)->cprowIndices; 3659 if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); } 3660 if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); } 3661 if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); } 3662 3663 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3664 Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 3665 if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);} 3666 for (int i=0; i<3; i++) { 3667 if (mdata->cuSpMV[i].initialized) { 3668 err = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err); 3669 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat); 3670 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat); 3671 } 3672 } 3673 #endif 3674 delete *matstruct; 3675 *matstruct = NULL; 3676 } 3677 PetscFunctionReturn(0); 3678 } 3679 3680 PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors) 3681 { 3682 PetscErrorCode ierr; 3683 3684 PetscFunctionBegin; 3685 if (*trifactors) { 3686 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr); 3687 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr); 3688 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr); 3689 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr); 3690 delete (*trifactors)->rpermIndices; 3691 delete (*trifactors)->cpermIndices; 3692 delete (*trifactors)->workVector; 3693 (*trifactors)->rpermIndices = NULL; 3694 (*trifactors)->cpermIndices = NULL; 3695 (*trifactors)->workVector = NULL; 3696 if ((*trifactors)->a_band_d) {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);} 3697 if ((*trifactors)->i_band_d) {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);} 3698 (*trifactors)->init_dev_prop = PETSC_FALSE; 3699 } 3700 PetscFunctionReturn(0); 3701 } 3702 3703 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 3704 { 3705 PetscErrorCode ierr; 3706 cusparseHandle_t handle; 3707 cusparseStatus_t stat; 3708 3709 PetscFunctionBegin; 3710 if (*trifactors) { 3711 ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr); 3712 if (handle = (*trifactors)->handle) { 3713 stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 3714 } 3715 ierr = PetscFree(*trifactors);CHKERRQ(ierr); 3716 } 3717 PetscFunctionReturn(0); 3718 } 3719 3720 struct IJCompare 3721 { 3722 __host__ __device__ 3723 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3724 { 3725 if (t1.get<0>() < t2.get<0>()) return true; 3726 if (t1.get<0>() == 
t2.get<0>()) return t1.get<1>() < t2.get<1>(); 3727 return false; 3728 } 3729 }; 3730 3731 struct IJEqual 3732 { 3733 __host__ __device__ 3734 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3735 { 3736 if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false; 3737 return true; 3738 } 3739 }; 3740 3741 struct IJDiff 3742 { 3743 __host__ __device__ 3744 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 3745 { 3746 return t1 == t2 ? 0 : 1; 3747 } 3748 }; 3749 3750 struct IJSum 3751 { 3752 __host__ __device__ 3753 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 3754 { 3755 return t1||t2; 3756 } 3757 }; 3758 3759 #include <thrust/iterator/discard_iterator.h> 3760 PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 3761 { 3762 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3763 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3764 THRUSTARRAY *cooPerm_v = NULL; 3765 thrust::device_ptr<const PetscScalar> d_v; 3766 CsrMatrix *matrix; 3767 PetscErrorCode ierr; 3768 PetscInt n; 3769 3770 PetscFunctionBegin; 3771 if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct"); 3772 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix"); 3773 if (!cusp->cooPerm) { 3774 ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3775 ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3776 PetscFunctionReturn(0); 3777 } 3778 matrix = (CsrMatrix*)cusp->mat->mat; 3779 if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3780 if (!v) { 3781 if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3782 goto finalize; 3783 } 3784 n = cusp->cooPerm->size(); 3785 if (isCudaMem(v)) { 3786 d_v = thrust::device_pointer_cast(v); 3787 } else { 3788 cooPerm_v = new THRUSTARRAY(n); 3789 cooPerm_v->assign(v,v+n); 3790 d_v = cooPerm_v->data(); 3791 ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); 3792 } 3793 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3794 if (imode == ADD_VALUES) { /* ADD_VALUES means add to the existing entries */ 3795 if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to sum them up */ 3796 THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size()); 3797 auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3798 /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output) 3799 cooPerm_a = [0,0,1,2,3,4]. The length is n, the number of entries in d_v[]. 3800 cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
3801 */ 3802 thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3803 thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>()); 3804 delete cooPerm_w; 3805 } else { 3806 /* all nonzeros in d_v[] are unique entries */ 3807 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 3808 matrix->values->begin())); 3809 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 3810 matrix->values->end())); 3811 thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */ 3812 } 3813 } else { 3814 if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */ 3815 auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3816 thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3817 } else { 3818 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 3819 matrix->values->begin())); 3820 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 3821 matrix->values->end())); 3822 thrust::for_each(zibit,zieit,VecCUDAEquals()); 3823 } 3824 } 3825 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3826 finalize: 3827 delete cooPerm_v; 3828 A->offloadmask = PETSC_OFFLOAD_GPU; 3829 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 3830 /* shorter version of MatAssemblyEnd_SeqAIJ */ 3831 ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr); 3832 ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 3833 ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr); 3834 a->reallocs = 0; 3835 A->info.mallocs += 0; 3836 A->info.nz_unneeded = 0; 3837 A->assembled = A->was_assembled = PETSC_TRUE; 3838 A->num_ass++; 3839 PetscFunctionReturn(0); 3840 } 3841 3842 PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy) 3843 { 3844 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3845 PetscErrorCode ierr; 3846 3847 PetscFunctionBegin; 3848 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3849 if (!cusp) PetscFunctionReturn(0); 3850 if (destroy) { 3851 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr); 3852 delete cusp->csr2csc_i; 3853 cusp->csr2csc_i = NULL; 3854 } 3855 A->transupdated = PETSC_FALSE; 3856 PetscFunctionReturn(0); 3857 } 3858 3859 #include <thrust/binary_search.h> 3860 PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[]) 3861 { 3862 PetscErrorCode ierr; 3863 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3864 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3865 PetscInt cooPerm_n, nzr = 0; 3866 cudaError_t cerr; 3867 3868 PetscFunctionBegin; 3869 ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr); 3870 ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr); 3871 cooPerm_n = cusp->cooPerm ? 
cusp->cooPerm->size() : 0; 3872 if (n != cooPerm_n) { 3873 delete cusp->cooPerm; 3874 delete cusp->cooPerm_a; 3875 cusp->cooPerm = NULL; 3876 cusp->cooPerm_a = NULL; 3877 } 3878 if (n) { 3879 THRUSTINTARRAY d_i(n); 3880 THRUSTINTARRAY d_j(n); 3881 THRUSTINTARRAY ii(A->rmap->n); 3882 3883 if (!cusp->cooPerm) { cusp->cooPerm = new THRUSTINTARRAY(n); } 3884 if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); } 3885 3886 ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 3887 d_i.assign(coo_i,coo_i+n); 3888 d_j.assign(coo_j,coo_j+n); 3889 3890 /* Ex. 3891 n = 6 3892 coo_i = [3,3,1,4,1,4] 3893 coo_j = [3,2,2,5,2,6] 3894 */ 3895 auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin())); 3896 auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end())); 3897 3898 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3899 thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0); 3900 thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */ 3901 *cusp->cooPerm_a = d_i; /* copy the sorted array */ 3902 THRUSTINTARRAY w = d_j; 3903 3904 /* 3905 d_i = [1,1,3,3,4,4] 3906 d_j = [2,2,2,3,5,6] 3907 cooPerm = [2,4,1,0,3,5] 3908 */ 3909 auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */ 3910 3911 /* 3912 d_i = [1,3,3,4,4,x] 3913 ^ekey 3914 d_j = [2,2,3,5,6,x] 3915 ^nekey 3916 */ 3917 if (nekey == ekey) { /* all entries are unique */ 3918 delete cusp->cooPerm_a; 3919 cusp->cooPerm_a = NULL; 3920 } else { /* Stefano: I couldn't come up with a more elegant algorithm */ 3921 /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */ 3922 adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/ 3923 adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff()); /* w: [2,2,2,3,5,6] => [2,0,0,1,1,1]*/ 3924 (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */ 3925 w[0] = 0; 3926 thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a = [0,0,1,1,1,1]*/ 3927 thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/ 3928 } 3929 thrust::counting_iterator<PetscInt> search_begin(0); 3930 thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */ 3931 search_begin, search_begin + A->rmap->n, /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */ 3932 ii.begin()); /* ii = [0,1,1,3,5,5].
A leading 0 will be added later */ 3933 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3934 3935 ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr); 3936 a->singlemalloc = PETSC_FALSE; 3937 a->free_a = PETSC_TRUE; 3938 a->free_ij = PETSC_TRUE; 3939 ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr); 3940 a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */ 3941 cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 3942 a->nz = a->maxnz = a->i[A->rmap->n]; 3943 a->rmax = 0; 3944 ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr); 3945 ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr); 3946 cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 3947 if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); } 3948 if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); } 3949 for (PetscInt i = 0; i < A->rmap->n; i++) { 3950 const PetscInt nnzr = a->i[i+1] - a->i[i]; 3951 nzr += (PetscInt)!!(nnzr); 3952 a->ilen[i] = a->imax[i] = nnzr; 3953 a->rmax = PetscMax(a->rmax,nnzr); 3954 } 3955 a->nonzerorowcnt = nzr; 3956 A->preallocated = PETSC_TRUE; 3957 ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr); 3958 ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr); 3959 } else { 3960 ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr); 3961 } 3962 ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr); 3963 3964 /* We want to allocate the CUSPARSE struct for matvec now. 3965 The code is so convoluted now that I prefer to copy zeros */ 3966 ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr); 3967 ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr); 3968 A->offloadmask = PETSC_OFFLOAD_CPU; 3969 A->nonzerostate++; 3970 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 3971 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 3972 3973 A->assembled = PETSC_FALSE; 3974 A->was_assembled = PETSC_FALSE; 3975 PetscFunctionReturn(0); 3976 } 3977 3978 /*@C 3979 MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices. 
3980 3981 Not collective 3982 3983 Input Parameters: 3984 + A - the matrix 3985 - compressed - PETSC_TRUE or PETSC_FALSE indicating whether the matrix data structure should always be returned in compressed form 3986 3987 Output Parameters: 3988 + i - the CSR row pointers 3989 - j - the CSR column indices 3990 3991 Level: developer 3992 3993 Notes: 3994 When compressed is true, the CSR structure does not contain empty rows 3995 3996 .seealso: MatSeqAIJCUSPARSERestoreIJ(), MatSeqAIJCUSPARSEGetArrayRead() 3997 @*/ 3998 PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j) 3999 { 4000 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4001 CsrMatrix *csr; 4002 PetscErrorCode ierr; 4003 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4004 4005 PetscFunctionBegin; 4006 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4007 if (!i || !j) PetscFunctionReturn(0); 4008 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4009 if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4010 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4011 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4012 csr = (CsrMatrix*)cusp->mat->mat; 4013 if (i) { 4014 if (!compressed && a->compressedrow.use) { /* need full row offset */ 4015 if (!cusp->rowoffsets_gpu) { 4016 cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 4017 cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 4018 ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 4019 } 4020 *i = cusp->rowoffsets_gpu->data().get(); 4021 } else *i = csr->row_offsets->data().get(); 4022 } 4023 if (j) *j = csr->column_indices->data().get(); 4024 PetscFunctionReturn(0); 4025 } 4026 4027 /*@C 4028 MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ() 4029 4030 Not collective 4031 4032 Input Parameters: 4033 + A - the matrix 4034 - compressed - PETSC_TRUE or PETSC_FALSE indicating whether the matrix data structure should always be returned in compressed form 4035 4036 Output Parameters: 4037 + i - the CSR row pointers 4038 - j - the CSR column indices 4039 4040 Level: developer 4041 4042 .seealso: MatSeqAIJCUSPARSEGetIJ() 4043 @*/ 4044 PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j) 4045 { 4046 PetscFunctionBegin; 4047 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4048 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4049 if (i) *i = NULL; 4050 if (j) *j = NULL; 4051 PetscFunctionReturn(0); 4052 } 4053 4054 /*@C 4055 MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 4056 4057 Not Collective 4058 4059 Input Parameter: 4060 . A - a MATSEQAIJCUSPARSE matrix 4061 4062 Output Parameter: 4063 .
a - pointer to the device data 4064 4065 Level: developer 4066 4067 Notes: may trigger host-device copies if up-to-date matrix data is on host 4068 4069 .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArrayRead() 4070 @*/ 4071 PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a) 4072 { 4073 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4074 CsrMatrix *csr; 4075 PetscErrorCode ierr; 4076 4077 PetscFunctionBegin; 4078 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4079 PetscValidPointer(a,2); 4080 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4081 if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4082 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4083 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4084 csr = (CsrMatrix*)cusp->mat->mat; 4085 if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 4086 *a = csr->values->data().get(); 4087 PetscFunctionReturn(0); 4088 } 4089 4090 /*@C 4091 MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead() 4092 4093 Not Collective 4094 4095 Input Parameter: 4096 . A - a MATSEQAIJCUSPARSE matrix 4097 4098 Output Parameter: 4099 . a - pointer to the device data 4100 4101 Level: developer 4102 4103 .seealso: MatSeqAIJCUSPARSEGetArrayRead() 4104 @*/ 4105 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a) 4106 { 4107 PetscFunctionBegin; 4108 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4109 PetscValidPointer(a,2); 4110 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4111 *a = NULL; 4112 PetscFunctionReturn(0); 4113 } 4114 4115 /*@C 4116 MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 4117 4118 Not Collective 4119 4120 Input Parameter: 4121 . A - a MATSEQAIJCUSPARSE matrix 4122 4123 Output Parameter: 4124 . a - pointer to the device data 4125 4126 Level: developer 4127 4128 Notes: may trigger host-device copies if up-to-date matrix data is on host 4129 4130 .seealso: MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArray() 4131 @*/ 4132 PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a) 4133 { 4134 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4135 CsrMatrix *csr; 4136 PetscErrorCode ierr; 4137 4138 PetscFunctionBegin; 4139 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4140 PetscValidPointer(a,2); 4141 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4142 if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4143 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4144 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4145 csr = (CsrMatrix*)cusp->mat->mat; 4146 if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 4147 *a = csr->values->data().get(); 4148 A->offloadmask = PETSC_OFFLOAD_GPU; 4149 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); 4150 PetscFunctionReturn(0); 4151 } 4152 /*@C 4153 MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray() 4154 4155 Not Collective 4156 4157 Input Parameter: 4158 . A - a MATSEQAIJCUSPARSE matrix 4159 4160 Output Parameter: 4161 . 
a - pointer to the device data 4162 4163 Level: developer 4164 4165 .seealso: MatSeqAIJCUSPARSEGetArray() 4166 @*/ 4167 PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a) 4168 { 4169 PetscErrorCode ierr; 4170 4171 PetscFunctionBegin; 4172 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4173 PetscValidPointer(a,2); 4174 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4175 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 4176 *a = NULL; 4177 PetscFunctionReturn(0); 4178 } 4179 4180 /*@C 4181 MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 4182 4183 Not Collective 4184 4185 Input Parameter: 4186 . A - a MATSEQAIJCUSPARSE matrix 4187 4188 Output Parameter: 4189 . a - pointer to the device data 4190 4191 Level: developer 4192 4193 Notes: does not trigger host-device copies and flags data validity on the GPU 4194 4195 .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSERestoreArrayWrite() 4196 @*/ 4197 PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a) 4198 { 4199 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4200 CsrMatrix *csr; 4201 PetscErrorCode ierr; 4202 4203 PetscFunctionBegin; 4204 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4205 PetscValidPointer(a,2); 4206 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4207 if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4208 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4209 csr = (CsrMatrix*)cusp->mat->mat; 4210 if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 4211 *a = csr->values->data().get(); 4212 A->offloadmask = PETSC_OFFLOAD_GPU; 4213 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); 4214 PetscFunctionReturn(0); 4215 } 4216 4217 /*@C 4218 MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite() 4219 4220 Not Collective 4221 4222 Input Parameter: 4223 . A - a MATSEQAIJCUSPARSE matrix 4224 4225 Output Parameter: 4226 . a - pointer to the device data 4227 4228 Level: developer 4229 4230 .seealso: MatSeqAIJCUSPARSEGetArrayWrite() 4231 @*/ 4232 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a) 4233 { 4234 PetscErrorCode ierr; 4235 4236 PetscFunctionBegin; 4237 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4238 PetscValidPointer(a,2); 4239 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4240 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 4241 *a = NULL; 4242 PetscFunctionReturn(0); 4243 } 4244 4245 struct IJCompare4 4246 { 4247 __host__ __device__ 4248 inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) 4249 { 4250 if (t1.get<0>() < t2.get<0>()) return true; 4251 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 4252 return false; 4253 } 4254 }; 4255 4256 struct Shift 4257 { 4258 int _shift; 4259 4260 Shift(int shift) : _shift(shift) {} 4261 __host__ __device__ 4262 inline int operator() (const int &c) 4263 { 4264 return c + _shift; 4265 } 4266 }; 4267 4268 /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. 
[A';B']' operation in matlab notation */ 4269 PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 4270 { 4271 PetscErrorCode ierr; 4272 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c; 4273 Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp; 4274 Mat_SeqAIJCUSPARSEMultStruct *Cmat; 4275 CsrMatrix *Acsr,*Bcsr,*Ccsr; 4276 PetscInt Annz,Bnnz; 4277 cusparseStatus_t stat; 4278 PetscInt i,m,n,zero = 0; 4279 cudaError_t cerr; 4280 4281 PetscFunctionBegin; 4282 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4283 PetscValidHeaderSpecific(B,MAT_CLASSID,2); 4284 PetscValidPointer(C,4); 4285 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4286 PetscCheckTypeName(B,MATSEQAIJCUSPARSE); 4287 if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",A->rmap->n,B->rmap->n); 4288 if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 4289 if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4290 if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4291 if (reuse == MAT_INITIAL_MATRIX) { 4292 m = A->rmap->n; 4293 n = A->cmap->n + B->cmap->n; 4294 ierr = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr); 4295 ierr = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr); 4296 ierr = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 4297 c = (Mat_SeqAIJ*)(*C)->data; 4298 Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 4299 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 4300 Ccsr = new CsrMatrix; 4301 Cmat->cprowIndices = NULL; 4302 c->compressedrow.use = PETSC_FALSE; 4303 c->compressedrow.nrows = 0; 4304 c->compressedrow.i = NULL; 4305 c->compressedrow.rindex = NULL; 4306 Ccusp->workVector = NULL; 4307 Ccusp->nrows = m; 4308 Ccusp->mat = Cmat; 4309 Ccusp->mat->mat = Ccsr; 4310 Ccsr->num_rows = m; 4311 Ccsr->num_cols = n; 4312 stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat); 4313 stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4314 stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 4315 cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 4316 cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 4317 cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 4318 cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4319 cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4320 cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4321 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4322 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 4323 if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4324 if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4325 4326 Acsr = (CsrMatrix*)Acusp->mat->mat; 4327 Bcsr = (CsrMatrix*)Bcusp->mat->mat; 4328 Annz = (PetscInt)Acsr->column_indices->size(); 4329 Bnnz = (PetscInt)Bcsr->column_indices->size(); 4330 c->nz = Annz + Bnnz; 4331 Ccsr->row_offsets = new THRUSTINTARRAY32(m+1); 4332
Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 4333 Ccsr->values = new THRUSTARRAY(c->nz); 4334 Ccsr->num_entries = c->nz; 4335 Ccusp->cooPerm = new THRUSTINTARRAY(c->nz); 4336 if (c->nz) { 4337 auto Acoo = new THRUSTINTARRAY32(Annz); 4338 auto Bcoo = new THRUSTINTARRAY32(Bnnz); 4339 auto Ccoo = new THRUSTINTARRAY32(c->nz); 4340 THRUSTINTARRAY32 *Aroff,*Broff; 4341 4342 if (a->compressedrow.use) { /* need full row offset */ 4343 if (!Acusp->rowoffsets_gpu) { 4344 Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 4345 Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 4346 ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 4347 } 4348 Aroff = Acusp->rowoffsets_gpu; 4349 } else Aroff = Acsr->row_offsets; 4350 if (b->compressedrow.use) { /* need full row offset */ 4351 if (!Bcusp->rowoffsets_gpu) { 4352 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 4353 Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 4354 ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 4355 } 4356 Broff = Bcusp->rowoffsets_gpu; 4357 } else Broff = Bcsr->row_offsets; 4358 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4359 stat = cusparseXcsr2coo(Acusp->handle, 4360 Aroff->data().get(), 4361 Annz, 4362 m, 4363 Acoo->data().get(), 4364 CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4365 stat = cusparseXcsr2coo(Bcusp->handle, 4366 Broff->data().get(), 4367 Bnnz, 4368 m, 4369 Bcoo->data().get(), 4370 CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4371 /* Issues when using bool with large matrices on SUMMIT 10.2.89 */ 4372 auto Aperm = thrust::make_constant_iterator(1); 4373 auto Bperm = thrust::make_constant_iterator(0); 4374 #if PETSC_PKG_CUDA_VERSION_GE(10,0,0) 4375 auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n)); 4376 auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n)); 4377 #else 4378 /* there are issues instantiating the merge operation using a transform iterator for the columns of B */ 4379 auto Bcib = Bcsr->column_indices->begin(); 4380 auto Bcie = Bcsr->column_indices->end(); 4381 thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n)); 4382 #endif 4383 auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz); 4384 auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm)); 4385 auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm)); 4386 auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm)); 4387 auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm)); 4388 auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin())); 4389 auto p1 = Ccusp->cooPerm->begin(); 4390 auto p2 = Ccusp->cooPerm->begin(); 4391 thrust::advance(p2,Annz); 4392 PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4())); 4393 #if PETSC_PKG_CUDA_VERSION_LT(10,0,0) 4394 thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n)); 4395 #endif 4396 auto cci = thrust::make_counting_iterator(zero); 4397 auto cce = thrust::make_counting_iterator(c->nz); 4398 #if 0 //Errors on SUMMIT cuda 11.1.0 4399 PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>())); 4400 #else 4401 auto pred = thrust::identity<int>(); 
4402 PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred)); 4403 PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred)); 4404 #endif 4405 stat = cusparseXcoo2csr(Ccusp->handle, 4406 Ccoo->data().get(), 4407 c->nz, 4408 m, 4409 Ccsr->row_offsets->data().get(), 4410 CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4411 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4412 delete wPerm; 4413 delete Acoo; 4414 delete Bcoo; 4415 delete Ccoo; 4416 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4417 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, 4418 Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), 4419 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 4420 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 4421 #endif 4422 if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */ 4423 ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr); 4424 ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr); 4425 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4426 Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct; 4427 CsrMatrix *CcsrT = new CsrMatrix; 4428 CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4429 CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4430 4431 (*C)->form_explicit_transpose = PETSC_TRUE; 4432 (*C)->transupdated = PETSC_TRUE; 4433 Ccusp->rowoffsets_gpu = NULL; 4434 CmatT->cprowIndices = NULL; 4435 CmatT->mat = CcsrT; 4436 CcsrT->num_rows = n; 4437 CcsrT->num_cols = m; 4438 CcsrT->num_entries = c->nz; 4439 4440 CcsrT->row_offsets = new THRUSTINTARRAY32(n+1); 4441 CcsrT->column_indices = new THRUSTINTARRAY32(c->nz); 4442 CcsrT->values = new THRUSTARRAY(c->nz); 4443 4444 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4445 auto rT = CcsrT->row_offsets->begin(); 4446 if (AT) { 4447 rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT); 4448 thrust::advance(rT,-1); 4449 } 4450 if (BT) { 4451 auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz)); 4452 auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz)); 4453 thrust::copy(titb,tite,rT); 4454 } 4455 auto cT = CcsrT->column_indices->begin(); 4456 if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT); 4457 if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT); 4458 auto vT = CcsrT->values->begin(); 4459 if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4460 if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 4461 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4462 4463 stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat); 4464 stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4465 stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 4466 cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 4467 cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 4468 cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 4469 cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, 
sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4470 cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4471 cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4472 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4473 stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, 4474 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), 4475 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 4476 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 4477 #endif 4478 Ccusp->matTranspose = CmatT; 4479 } 4480 } 4481 4482 c->singlemalloc = PETSC_FALSE; 4483 c->free_a = PETSC_TRUE; 4484 c->free_ij = PETSC_TRUE; 4485 ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 4486 ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 4487 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 4488 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 4489 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 4490 ii = *Ccsr->row_offsets; 4491 jj = *Ccsr->column_indices; 4492 cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4493 cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4494 } else { 4495 cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4496 cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4497 } 4498 ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 4499 ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 4500 ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 4501 c->maxnz = c->nz; 4502 c->nonzerorowcnt = 0; 4503 c->rmax = 0; 4504 for (i = 0; i < m; i++) { 4505 const PetscInt nn = c->i[i+1] - c->i[i]; 4506 c->ilen[i] = c->imax[i] = nn; 4507 c->nonzerorowcnt += (PetscInt)!!nn; 4508 c->rmax = PetscMax(c->rmax,nn); 4509 } 4510 ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr); 4511 ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 4512 (*C)->nonzerostate++; 4513 ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr); 4514 ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr); 4515 Ccusp->nonzerostate = (*C)->nonzerostate; 4516 (*C)->preallocated = PETSC_TRUE; 4517 } else { 4518 if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",(*C)->rmap->n,B->rmap->n); 4519 c = (Mat_SeqAIJ*)(*C)->data; 4520 if (c->nz) { 4521 Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 4522 if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm"); 4523 if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4524 if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate"); 4525 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4526 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 4527 if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4528 if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4529 Acsr =
(CsrMatrix*)Acusp->mat->mat; 4530 Bcsr = (CsrMatrix*)Bcusp->mat->mat; 4531 Ccsr = (CsrMatrix*)Ccusp->mat->mat; 4532 if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size()); 4533 if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size()); 4534 if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size()); 4535 if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries); 4536 if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size()); 4537 auto pmid = Ccusp->cooPerm->begin(); 4538 thrust::advance(pmid,Acsr->num_entries); 4539 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4540 auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), 4541 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin()))); 4542 auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), 4543 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4544 thrust::for_each(zibait,zieait,VecCUDAEquals()); 4545 auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), 4546 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4547 auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), 4548 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end()))); 4549 thrust::for_each(zibbit,ziebit,VecCUDAEquals()); 4550 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr); 4551 if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) { 4552 if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct"); 4553 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4554 CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4555 CsrMatrix *BcsrT = BT ? 
(CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4556 CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat; 4557 auto vT = CcsrT->values->begin(); 4558 if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4559 if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 4560 (*C)->transupdated = PETSC_TRUE; 4561 } 4562 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4563 } 4564 } 4565 ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr); 4566 (*C)->assembled = PETSC_TRUE; 4567 (*C)->was_assembled = PETSC_FALSE; 4568 (*C)->offloadmask = PETSC_OFFLOAD_GPU; 4569 PetscFunctionReturn(0); 4570 } 4571 4572 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[]) 4573 { 4574 PetscErrorCode ierr; 4575 bool dmem; 4576 const PetscScalar *av; 4577 cudaError_t cerr; 4578 4579 PetscFunctionBegin; 4580 dmem = isCudaMem(v); 4581 ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr); 4582 if (n && idx) { 4583 THRUSTINTARRAY widx(n); 4584 widx.assign(idx,idx+n); 4585 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 4586 4587 THRUSTARRAY *w = NULL; 4588 thrust::device_ptr<PetscScalar> dv; 4589 if (dmem) { 4590 dv = thrust::device_pointer_cast(v); 4591 } else { 4592 w = new THRUSTARRAY(n); 4593 dv = w->data(); 4594 } 4595 thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av); 4596 4597 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv)); 4598 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n)); 4599 thrust::for_each(zibit,zieit,VecCUDAEquals()); 4600 if (w) { 4601 cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4602 } 4603 delete w; 4604 } else { 4605 cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4606 } 4607 if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); } /* the values were copied from the GPU to the host */ 4608 ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr); 4609 PetscFunctionReturn(0); 4610 } 4611
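/*
   Illustrative sketch (not part of the library): the COO assembly path implemented above,
   MatSetPreallocationCOO_SeqAIJCUSPARSE() followed by MatSetValuesCOO_SeqAIJCUSPARSE(), is
   normally reached through the public MatSetPreallocationCOO()/MatSetValuesCOO() entry points;
   that dispatch is assumed here, and the sizes, indices and values below are made up purely
   for illustration.

     Mat            A;
     PetscErrorCode ierr;
     PetscInt       coo_i[] = {3,3,1,4,1,4};        (row indices, duplicates allowed)
     PetscInt       coo_j[] = {3,2,2,5,2,6};        (column indices; entry (1,2) appears twice)
     PetscScalar    v[]     = {1.,2.,3.,4.,5.,6.};  (one value per COO entry)

     ierr = MatCreate(PETSC_COMM_SELF,&A);CHKERRQ(ierr);
     ierr = MatSetSizes(A,6,7,6,7);CHKERRQ(ierr);
     ierr = MatSetType(A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
     ierr = MatSetPreallocationCOO(A,6,coo_i,coo_j);CHKERRQ(ierr);  (builds cooPerm/cooPerm_a; repeated indices are compressed)
     ierr = MatSetValuesCOO(A,v,INSERT_VALUES);CHKERRQ(ierr);       (v may live in host or device memory, see isCudaMem();
                                                                     repeated entries are summed on the GPU)
     ierr = MatDestroy(&A);CHKERRQ(ierr);
*/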
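/*
   Illustrative sketch (not part of the library): typical pairing of the device accessors defined
   above for a matrix A that is assumed to already be of type MATSEQAIJCUSPARSE. The kernel launch
   is only a placeholder for user code; the point is the Get/Restore sequence.

     const int         *di,*dj;   (device CSR row offsets and column indices, 32-bit)
     const PetscScalar *da;       (device CSR values)

     ierr = MatSeqAIJCUSPARSEGetIJ(A,PETSC_FALSE,&di,&dj);CHKERRQ(ierr);   (PETSC_FALSE: full, uncompressed row offsets)
     ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&da);CHKERRQ(ierr);            (may trigger a host-to-device copy)
     ... launch a user kernel consuming (di,dj,da) ...
     ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&da);CHKERRQ(ierr);
     ierr = MatSeqAIJCUSPARSERestoreIJ(A,PETSC_FALSE,&di,&dj);CHKERRQ(ierr);
*/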