/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <thrust/async/for_each.h>

/* Names consumed by PetscOptionsEnum() when parsing -mat_cusparse_*_storage_format */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
  } cusparseCsr2CscAlg_t;
*/
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG","CSRMV_ALG1","CSRMV_ALG2","cusparseSpMVAlg_t","CUSPARSE_",0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif

/* Forward declarations of the factorization, solve, and mat-vec implementations defined later in this file */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static
PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);

PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);

/* Attach a user-provided CUDA stream to the cuSPARSE handle stored on A */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

/* Replace the cuSPARSE handle of A (destroying any previous one) and select device pointer mode */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      stat =
cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

/* Forget (without destroying) the cuSPARSE handle stored on A; no-op unless A is MATSEQAIJCUSPARSE */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          flg;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg || !cusparsestruct) PetscFunctionReturn(0);
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

/* Query function composed on factor matrices: reports the solver package name */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.
148 149 Level: beginner 150 151 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 152 M*/ 153 154 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B) 155 { 156 PetscErrorCode ierr; 157 PetscInt n = A->rmap->n; 158 159 PetscFunctionBegin; 160 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 161 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 162 (*B)->factortype = ftype; 163 ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 164 165 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 166 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 167 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 168 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 169 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 170 ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr); 171 ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr); 172 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 173 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 174 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 175 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr); 176 ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr); 177 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 178 179 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 180 (*B)->canuseordering = PETSC_TRUE; 181 ierr = 
PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type-specific implementation behind MatCUSPARSESetFormat() */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.
   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB.
The latter two require CUDA 4.2)

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Handle MAT_FORM_EXPLICIT_TRANSPOSE here; delegate every other option to the CPU SeqAIJ implementation */
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_FORM_EXPLICIT_TRANSPOSE:
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);}
    A->form_explicit_transpose = flg;
    break;
  default:
    ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
    break;
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);

/* Numeric LU factorization: factor on the CPU, then select solve kernels and stage the factors on the GPU */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used.
*/
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Process -mat_cusparse_* runtime options (storage format and cuSPARSE algorithm choices) */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Symbolic ILU: reset any stale GPU factors, run the CPU symbolic phase, and install the GPU numeric routine */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric =
MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic LU: reset stale GPU factors, run CPU symbolic phase, install GPU numeric routine */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic ICC: reset stale GPU factors, run CPU symbolic phase, install GPU numeric routine */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic Cholesky: reset stale GPU factors, run CPU symbolic phase, install GPU numeric routine */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Build (first call) or refresh (later calls) the unit-diagonal lower triangular ILU factor on the GPU */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor =
(Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower = n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;

        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix: row 0 holds only the unit diagonal entry */
        AiLo[0]   = (PetscInt) 0;
        AiLo[n]   = nzLower;
        AjLo[0]   = (PetscInt) 0;
        AALo[0]   = (MatScalar) 1.0;
        v         = aa;
        vi        = aj;
        offset    = 1;
        rowOffset = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 loTriFactor->solveInfo,
                                 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                                 loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h = AALo;
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v      = aa;
        vi     = aj;
        offset = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Build (first call) or refresh (later calls) the upper triangular ILU factor on the GPU */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix, walking rows from the bottom up
           (adiag[] stores diagonal locations in reverse order for SeqAIJ factors) */
        AiUp[0] = (PetscInt) 0;
        AiUp[n] = nzUpper;
        offset  = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;
          vi = aj + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1./v[nz];
          AiUp[i]      = AiUp[i+1] - (nz+1);

          ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = n;
        upTriFactor->csrMat->num_cols    = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 upTriFactor->solveInfo,
                                 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                                 upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
        upTriFactor->AA_h = AAUp;
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      } else {
        if (!upTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v = aa + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1./v[nz];
          ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
        ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 653 } 654 } 655 PetscFunctionReturn(0); 656 } 657 658 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 659 { 660 PetscErrorCode ierr; 661 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 662 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 663 IS isrow = a->row,iscol = a->icol; 664 PetscBool row_identity,col_identity; 665 PetscInt n = A->rmap->n; 666 667 PetscFunctionBegin; 668 if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 669 ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr); 670 ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr); 671 672 if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 673 cusparseTriFactors->nnz=a->nz; 674 675 A->offloadmask = PETSC_OFFLOAD_BOTH; 676 /* lower triangular indices */ 677 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 678 if (!row_identity && !cusparseTriFactors->rpermIndices) { 679 const PetscInt *r; 680 681 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 682 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 683 cusparseTriFactors->rpermIndices->assign(r, r+n); 684 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 685 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 686 } 687 688 /* upper triangular indices */ 689 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 690 if (!col_identity && !cusparseTriFactors->cpermIndices) { 691 const PetscInt *c; 692 693 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 694 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 695 cusparseTriFactors->cpermIndices->assign(c, c+n); 696 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 697 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 698 } 699 PetscFunctionReturn(0); 700 } 701 702 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 703 { 704 Mat_SeqAIJ *a = 
(Mat_SeqAIJ*)A->data; 705 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 706 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 707 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 708 cusparseStatus_t stat; 709 PetscErrorCode ierr; 710 cudaError_t cerr; 711 PetscInt *AiUp, *AjUp; 712 PetscScalar *AAUp; 713 PetscScalar *AALo; 714 PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 715 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 716 const PetscInt *ai = b->i,*aj = b->j,*vj; 717 const MatScalar *aa = b->a,*v; 718 719 PetscFunctionBegin; 720 if (!n) PetscFunctionReturn(0); 721 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 722 try { 723 cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 724 cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 725 if (!upTriFactor && !loTriFactor) { 726 /* Allocate Space for the upper triangular matrix */ 727 cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 728 cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 729 730 /* Fill the upper triangular matrix */ 731 AiUp[0]=(PetscInt) 0; 732 AiUp[n]=nzUpper; 733 offset = 0; 734 for (i=0; i<n; i++) { 735 /* set the pointers */ 736 v = aa + ai[i]; 737 vj = aj + ai[i]; 738 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 739 740 /* first, set the diagonal elements */ 741 AjUp[offset] = (PetscInt) i; 742 AAUp[offset] = (MatScalar)1.0/v[nz]; 743 AiUp[i] = offset; 744 AALo[offset] = (MatScalar)1.0/v[nz]; 745 746 offset+=1; 747 if (nz>0) { 748 ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr); 749 ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 750 for (j=offset; j<offset+nz; j++) { 751 AAUp[j] = -AAUp[j]; 752 AALo[j] = 
AAUp[j]/v[nz]; 753 } 754 offset+=nz; 755 } 756 } 757 758 /* allocate space for the triangular factor information */ 759 ierr = PetscNew(&upTriFactor);CHKERRQ(ierr); 760 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 761 762 /* Create the matrix description */ 763 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 764 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 765 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 766 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 767 #else 768 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 769 #endif 770 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 771 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 772 773 /* set the matrix */ 774 upTriFactor->csrMat = new CsrMatrix; 775 upTriFactor->csrMat->num_rows = A->rmap->n; 776 upTriFactor->csrMat->num_cols = A->cmap->n; 777 upTriFactor->csrMat->num_entries = a->nz; 778 779 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 780 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 781 782 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 783 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 784 785 upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 786 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 787 788 /* set the operation */ 789 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 790 791 /* Create the solve analysis information */ 792 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 793 stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 794 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 795 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 796 
upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 797 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 798 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 799 &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 800 cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 801 #endif 802 803 /* perform the solve analysis */ 804 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 805 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 806 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 807 upTriFactor->csrMat->column_indices->data().get(), 808 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 809 upTriFactor->solveInfo, 810 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 811 #else 812 upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 813 #endif 814 cerr = WaitForCUDA();CHKERRCUDA(cerr); 815 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 816 817 /* assign the pointer */ 818 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 819 820 /* allocate space for the triangular factor information */ 821 ierr = PetscNew(&loTriFactor);CHKERRQ(ierr); 822 loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 823 824 /* Create the matrix description */ 825 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 826 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 827 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 828 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 829 #else 830 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 831 #endif 832 stat = cusparseSetMatFillMode(loTriFactor->descr, 
CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 833 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 834 835 /* set the operation */ 836 loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 837 838 /* set the matrix */ 839 loTriFactor->csrMat = new CsrMatrix; 840 loTriFactor->csrMat->num_rows = A->rmap->n; 841 loTriFactor->csrMat->num_cols = A->cmap->n; 842 loTriFactor->csrMat->num_entries = a->nz; 843 844 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 845 loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 846 847 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 848 loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 849 850 loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 851 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 852 853 /* Create the solve analysis information */ 854 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 855 stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 856 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 857 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 858 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 859 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 860 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 861 &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 862 cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 863 #endif 864 865 /* perform the solve analysis */ 866 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 867 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 868 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 869 loTriFactor->csrMat->column_indices->data().get(), 
870 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 871 loTriFactor->solveInfo, 872 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 873 #else 874 loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 875 #endif 876 cerr = WaitForCUDA();CHKERRCUDA(cerr); 877 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 878 879 /* assign the pointer */ 880 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 881 882 ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr); 883 cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 884 cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 885 } else { 886 /* Fill the upper triangular matrix */ 887 offset = 0; 888 for (i=0; i<n; i++) { 889 /* set the pointers */ 890 v = aa + ai[i]; 891 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 892 893 /* first, set the diagonal elements */ 894 AAUp[offset] = 1.0/v[nz]; 895 AALo[offset] = 1.0/v[nz]; 896 897 offset+=1; 898 if (nz>0) { 899 ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 900 for (j=offset; j<offset+nz; j++) { 901 AAUp[j] = -AAUp[j]; 902 AALo[j] = AAUp[j]/v[nz]; 903 } 904 offset+=nz; 905 } 906 } 907 if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 908 if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 909 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 910 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 911 ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 912 } 913 cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr); 914 cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr); 915 } catch(char *ex) { 916 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 917 } 918 } 919 PetscFunctionReturn(0); 920 } 921 922 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 923 { 924 PetscErrorCode ierr; 925 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 926 Mat_SeqAIJCUSPARSETriFactors 
*cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 927 IS ip = a->row; 928 PetscBool perm_identity; 929 PetscInt n = A->rmap->n; 930 931 PetscFunctionBegin; 932 if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 933 ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr); 934 if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 935 cusparseTriFactors->nnz=(a->nz-n)*2 + n; 936 937 A->offloadmask = PETSC_OFFLOAD_BOTH; 938 939 /* lower triangular indices */ 940 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 941 if (!perm_identity) { 942 IS iip; 943 const PetscInt *irip,*rip; 944 945 ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr); 946 ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr); 947 ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); 948 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 949 cusparseTriFactors->rpermIndices->assign(rip, rip+n); 950 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 951 cusparseTriFactors->cpermIndices->assign(irip, irip+n); 952 ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr); 953 ierr = ISDestroy(&iip);CHKERRQ(ierr); 954 ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 955 ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 956 } 957 PetscFunctionReturn(0); 958 } 959 960 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 961 { 962 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 963 IS ip = b->row; 964 PetscBool perm_identity; 965 PetscErrorCode ierr; 966 967 PetscFunctionBegin; 968 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 969 ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 970 B->offloadmask = PETSC_OFFLOAD_CPU; 971 /* determine which version of MatSolve needs to be used. 
*/ 972 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 973 if (perm_identity) { 974 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 975 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 976 B->ops->matsolve = NULL; 977 B->ops->matsolvetranspose = NULL; 978 } else { 979 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 980 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 981 B->ops->matsolve = NULL; 982 B->ops->matsolvetranspose = NULL; 983 } 984 985 /* get the triangular factors */ 986 ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 987 PetscFunctionReturn(0); 988 } 989 990 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 991 { 992 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 993 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 994 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 995 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; 996 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; 997 cusparseStatus_t stat; 998 cusparseIndexBase_t indexBase; 999 cusparseMatrixType_t matrixType; 1000 cusparseFillMode_t fillMode; 1001 cusparseDiagType_t diagType; 1002 cudaError_t cerr; 1003 PetscErrorCode ierr; 1004 1005 PetscFunctionBegin; 1006 /* allocate space for the transpose of the lower triangular factor */ 1007 ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr); 1008 loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1009 1010 /* set the matrix descriptors of the lower triangular factor */ 1011 matrixType = cusparseGetMatType(loTriFactor->descr); 1012 indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 1013 fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 
1014 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1015 diagType = cusparseGetMatDiagType(loTriFactor->descr); 1016 1017 /* Create the matrix description */ 1018 stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat); 1019 stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 1020 stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 1021 stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 1022 stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1023 1024 /* set the operation */ 1025 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1026 1027 /* allocate GPU space for the CSC of the lower triangular factor*/ 1028 loTriFactorT->csrMat = new CsrMatrix; 1029 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 1030 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 1031 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 1032 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1); 1033 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 1034 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 1035 1036 /* compute the transpose of the lower triangular factor, i.e. 
the CSC */ 1037 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1038 stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1039 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1040 loTriFactor->csrMat->values->data().get(), 1041 loTriFactor->csrMat->row_offsets->data().get(), 1042 loTriFactor->csrMat->column_indices->data().get(), 1043 loTriFactorT->csrMat->values->data().get(), 1044 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1045 CUSPARSE_ACTION_NUMERIC,indexBase, 1046 CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1047 cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1048 #endif 1049 1050 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1051 stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1052 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1053 loTriFactor->csrMat->values->data().get(), 1054 loTriFactor->csrMat->row_offsets->data().get(), 1055 loTriFactor->csrMat->column_indices->data().get(), 1056 loTriFactorT->csrMat->values->data().get(), 1057 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1058 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1059 CUSPARSE_ACTION_NUMERIC, indexBase, 1060 CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat); 1061 #else 1062 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1063 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1064 #endif 1065 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1066 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1067 1068 /* Create the solve analysis information */ 1069 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1070 
stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1071 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1072 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, 1073 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1074 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1075 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, 1076 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1077 cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1078 #endif 1079 1080 /* perform the solve analysis */ 1081 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 1082 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1083 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1084 loTriFactorT->csrMat->column_indices->data().get(), 1085 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1086 loTriFactorT->solveInfo, 1087 loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1088 #else 1089 loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1090 #endif 1091 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1092 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1093 1094 /* assign the pointer */ 1095 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 1096 1097 /*********************************************/ 1098 /* Now the Transpose of the Upper Tri Factor */ 1099 /*********************************************/ 1100 1101 /* allocate space for the transpose of the upper triangular factor */ 1102 ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr); 1103 upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1104 1105 /* set the matrix descriptors of the upper triangular factor */ 1106 matrixType = cusparseGetMatType(upTriFactor->descr); 
1107 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 1108 fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1109 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1110 diagType = cusparseGetMatDiagType(upTriFactor->descr); 1111 1112 /* Create the matrix description */ 1113 stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat); 1114 stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 1115 stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 1116 stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 1117 stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1118 1119 /* set the operation */ 1120 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1121 1122 /* allocate GPU space for the CSC of the upper triangular factor*/ 1123 upTriFactorT->csrMat = new CsrMatrix; 1124 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1125 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1126 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1127 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1); 1128 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1129 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1130 1131 /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 1132 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1133 stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows, 1134 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1135 upTriFactor->csrMat->values->data().get(), 1136 upTriFactor->csrMat->row_offsets->data().get(), 1137 upTriFactor->csrMat->column_indices->data().get(), 1138 upTriFactorT->csrMat->values->data().get(), 1139 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1140 CUSPARSE_ACTION_NUMERIC,indexBase, 1141 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1142 cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1143 #endif 1144 1145 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1146 stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 1147 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1148 upTriFactor->csrMat->values->data().get(), 1149 upTriFactor->csrMat->row_offsets->data().get(), 1150 upTriFactor->csrMat->column_indices->data().get(), 1151 upTriFactorT->csrMat->values->data().get(), 1152 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1153 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1154 CUSPARSE_ACTION_NUMERIC, indexBase, 1155 CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat); 1156 #else 1157 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1158 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1159 #endif 1160 1161 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1162 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1163 1164 /* Create the solve analysis information */ 1165 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 
1166 stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1167 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1168 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, 1169 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1170 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1171 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, 1172 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1173 cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1174 #endif 1175 1176 /* perform the solve analysis */ 1177 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 1178 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1179 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1180 upTriFactorT->csrMat->column_indices->data().get(), 1181 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1182 upTriFactorT->solveInfo, 1183 upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1184 #else 1185 upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1186 #endif 1187 1188 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1189 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1190 1191 /* assign the pointer */ 1192 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 1193 PetscFunctionReturn(0); 1194 } 1195 1196 struct PetscScalarToPetscInt 1197 { 1198 __host__ __device__ 1199 PetscInt operator()(PetscScalar s) 1200 { 1201 return (PetscInt)PetscRealPart(s); 1202 } 1203 }; 1204 1205 static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTransposeForMult(Mat A) 1206 { 1207 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1208 Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT; 1209 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1210 
cusparseStatus_t stat; 1211 cusparseIndexBase_t indexBase; 1212 cudaError_t err; 1213 PetscErrorCode ierr; 1214 1215 PetscFunctionBegin; 1216 if (!A->form_explicit_transpose || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0); 1217 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1218 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1219 if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct"); 1220 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1221 if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct"); 1222 if (A->transupdated) PetscFunctionReturn(0); 1223 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1224 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1225 if (cusparsestruct->format != MAT_CUSPARSE_CSR) { 1226 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 1227 } 1228 if (!cusparsestruct->matTranspose) { /* create cusparse matrix */ 1229 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 1230 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat); 1231 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1232 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat); 1233 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1234 1235 /* set alpha and beta */ 1236 err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 1237 err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1238 err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1239 err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1240 err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1241 err = 
cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1242 1243 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1244 CsrMatrix *matrixT = new CsrMatrix; 1245 matstructT->mat = matrixT; 1246 matrixT->num_rows = A->cmap->n; 1247 matrixT->num_cols = A->rmap->n; 1248 matrixT->num_entries = a->nz; 1249 matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 1250 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1251 matrixT->values = new THRUSTARRAY(a->nz); 1252 1253 if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); } 1254 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 1255 1256 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1257 stat = cusparseCreateCsr(&matstructT->matDescr, 1258 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1259 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1260 matrixT->values->data().get(), 1261 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1262 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 1263 #endif 1264 } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 1265 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1266 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1267 #else 1268 CsrMatrix *temp = new CsrMatrix; 1269 CsrMatrix *tempT = new CsrMatrix; 1270 /* First convert HYB to CSR */ 1271 temp->num_rows = A->rmap->n; 1272 temp->num_cols = A->cmap->n; 1273 temp->num_entries = a->nz; 1274 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1275 temp->column_indices = new THRUSTINTARRAY32(a->nz); 1276 temp->values = new THRUSTARRAY(a->nz); 1277 1278 stat = cusparse_hyb2csr(cusparsestruct->handle, 1279 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 1280 temp->values->data().get(), 1281 
temp->row_offsets->data().get(), 1282 temp->column_indices->data().get());CHKERRCUSPARSE(stat); 1283 1284 /* Next, convert CSR to CSC (i.e. the matrix transpose) */ 1285 tempT->num_rows = A->rmap->n; 1286 tempT->num_cols = A->cmap->n; 1287 tempT->num_entries = a->nz; 1288 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1289 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1290 tempT->values = new THRUSTARRAY(a->nz); 1291 1292 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1293 temp->num_cols, temp->num_entries, 1294 temp->values->data().get(), 1295 temp->row_offsets->data().get(), 1296 temp->column_indices->data().get(), 1297 tempT->values->data().get(), 1298 tempT->column_indices->data().get(), 1299 tempT->row_offsets->data().get(), 1300 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1301 1302 /* Last, convert CSC to HYB */ 1303 cusparseHybMat_t hybMat; 1304 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1305 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 
1306 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1307 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1308 matstructT->descr, tempT->values->data().get(), 1309 tempT->row_offsets->data().get(), 1310 tempT->column_indices->data().get(), 1311 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1312 1313 /* assign the pointer */ 1314 matstructT->mat = hybMat; 1315 A->transupdated = PETSC_TRUE; 1316 /* delete temporaries */ 1317 if (tempT) { 1318 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1319 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1320 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1321 delete (CsrMatrix*) tempT; 1322 } 1323 if (temp) { 1324 if (temp->values) delete (THRUSTARRAY*) temp->values; 1325 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1326 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1327 delete (CsrMatrix*) temp; 1328 } 1329 #endif 1330 } 1331 } 1332 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */ 1333 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1334 CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat; 1335 if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix"); 1336 if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows"); 1337 if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols"); 1338 if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values"); 1339 if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT"); 1340 if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows"); 1341 if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols"); 1342 if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values"); 1343 if 
(!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */ 1344 cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1345 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 1346 ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 1347 } 1348 if (!cusparsestruct->csr2csc_i) { 1349 THRUSTARRAY csr2csc_a(matrix->num_entries); 1350 PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0)); 1351 1352 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1353 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1354 void *csr2cscBuffer; 1355 size_t csr2cscBufferSize; 1356 stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, 1357 A->cmap->n, matrix->num_entries, 1358 matrix->values->data().get(), 1359 cusparsestruct->rowoffsets_gpu->data().get(), 1360 matrix->column_indices->data().get(), 1361 matrixT->values->data().get(), 1362 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1363 CUSPARSE_ACTION_NUMERIC,indexBase, 1364 cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat); 1365 err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err); 1366 #endif 1367 1368 if (matrix->num_entries) { 1369 /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in 1370 mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK. 1371 I checked every parameters and they were just fine. I have no clue why cusparse complains. 1372 1373 Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[] 1374 should be filled with indexBase. So I just take a shortcut here. 
1375 */ 1376 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1377 A->cmap->n,matrix->num_entries, 1378 csr2csc_a.data().get(), 1379 cusparsestruct->rowoffsets_gpu->data().get(), 1380 matrix->column_indices->data().get(), 1381 matrixT->values->data().get(), 1382 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1383 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1384 CUSPARSE_ACTION_NUMERIC,indexBase, 1385 cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat); 1386 #else 1387 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1388 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1389 #endif 1390 } else { 1391 matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 1392 } 1393 1394 cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); 1395 PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt())); 1396 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1397 err = cudaFree(csr2cscBuffer);CHKERRCUDA(err); 1398 #endif 1399 } 1400 PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), 1401 thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), 1402 matrixT->values->begin())); 1403 } 1404 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1405 ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1406 /* the compressed row indices is not used for matTranspose */ 1407 matstructT->cprowIndices = NULL; 1408 /* assign the pointer */ 1409 ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 1410 A->transupdated = PETSC_TRUE; 1411 PetscFunctionReturn(0); 1412 } 1413 1414 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? 
*/
/*
   MatSolveTranspose_SeqAIJCUSPARSE - triangular solve with the transposed factors:
   reorders b with the row permutation, runs the transposed U then L solves on the GPU,
   and finally applies the column permutation.  The transposed factor structures are
   built lazily on first use via MatSeqAIJCUSPARSEAnalyzeTransposeForSolve().
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder b with the row permutation, gathering into x (used as scratch here) */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U (transposed factor; note U comes before L for the transpose solve) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
#endif

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        xarray);CHKERRCUSPARSE(stat);
#endif

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution.
  */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - same transpose solve as above,
   but without row/column permutations (natural ordering): U^T solve from b into the
   work vector, then L^T solve into x.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U: b -> tempGPU (transposed factor order: U before L) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
#endif

  /* Then, solve L: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        xarray);CHKERRCUSPARSE(stat);
#endif

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSolve_SeqAIJCUSPARSE - forward/backward triangular solve with permutations:
   gathers b through the row permutation into the work vector, solves L then U on
   the GPU, and scatters the result through the column permutation into x.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: range length is given by the index
     iterators, so the same base bGPU is used for begin and end */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        xarray);CHKERRCUSPARSE(stat);
#endif

  /* Then, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,xarray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
#endif

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSolve_SeqAIJCUSPARSE_NaturalOrdering - forward/backward solve without
   permutations: L solve from b into the work vector, then U solve into x.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L: b -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        loTriFactor->solvePolicy,loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
#endif

  /* Next, solve U: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        xarray);CHKERRCUSPARSE(stat);
#endif

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJCUSPARSECopyFromGPU - when the values live only on the GPU, copies the
   CSR value array back to the host (a->a) and marks the matrix as valid on both.
   No-op for any other offload state.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cudaError_t        cerr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat;

    ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJGetArray_SeqAIJCUSPARSE - returns the host value array.  Forces a
   GPU->CPU copy first, then marks the CPU copy as authoritative since the caller
   may modify the values through the returned pointer.
*/
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  *array = a->a;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJCUSPARSECopyToGPU - mirrors the host AIJ matrix on the GPU.  If the
   nonzero pattern is unchanged (same nonzerostate) and the format is CSR, only the
   values are refreshed; otherwise the GPU structures are rebuilt from scratch in
   the requested storage format.  Errors if the matrix is bound to the CPU.
*/
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu)
SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      matrix->values->assign(a->a, a->a+a->nz);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* values changed, so any cached transpose values are stale (structure kept) */
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
    } else {
      /* nonzero pattern (or format) changed: destroy and rebuild everything */
      PetscInt nnz;
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* with compressed rows only the nonempty rows are represented */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
        if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; } /* no host values: upload structure only */
        else nnz = a->nz;

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident scalar constants (pointer mode DEVICE below expects these on the GPU) */
        err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build the GPU matrix in the chosen storage format: CSR, or (pre-CUDA-11) ELL/HYB */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices!
*/
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                     mat->num_rows, mat->num_cols, mat->num_entries,
                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                     mat->values->data().get(),
                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* build a temporary CSR copy, convert it to HYB, then discard the CSR */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
                                             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
                                  matstruct->descr, mat->values->data().get(),
                                  mat->row_offsets->data().get(),
                                  mat->column_indices->data().get(),
                                  hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp  = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp  = 0;
        }
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    /* 'both' is false when no host values existed; GPU then holds structure only */
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

/* Thrust functor: second tuple element += first (y = y + x element-wise) */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/* Thrust functor: second tuple element = first (y = x element-wise) */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/* Thrust functor: first tuple element = second (assignment in the reverse direction) */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};

/* Scratch data attached to a Mat_Product, kept alive between the symbolic and
   numeric phases of sparse-dense (and sparse-sparse) products on the GPU */
struct MatMatCusparse {
  PetscBool             cisdense;   /* C was MATSEQDENSE on entry; convert back after the GPU product */
  PetscScalar           *Bt;        /* pre-CUDA-11 buffer holding B^T (csrmm cannot transpose B) */
  Mat                   X;          /* intermediate dense result for RARt/PtAP */
  PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;
  CsrMatrix             *Bcsr;

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr;
  PetscBool             initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;
  cusparseDnMatDescr_t  matCDescr;
  PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  void                  *dBuffer4;
  void                  *dBuffer5;
#endif
  size_t                mmBufferSize;
  void                  *mmBuffer;
  void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};

/* Destructor for the MatMatCusparse product data: frees all device buffers and
   cusparse descriptors, destroys the intermediate matrix X, then frees the struct */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  PetscErrorCode   ierr;
  MatMatCusparse   *mmdata = (MatMatCusparse *)data;
  cudaError_t      cerr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
#endif

  PetscFunctionBegin;
  cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
  delete mmdata->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->matBDescr)   { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->matCDescr)   { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->spgemmDesc)  { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); }
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  if (mmdata->dBuffer4) { cerr = cudaFree(mmdata->dBuffer4);CHKERRCUDA(cerr); }
  if (mmdata->dBuffer5) { cerr = cudaFree(mmdata->dBuffer5);CHKERRCUDA(cerr); }
#endif
  if (mmdata->mmBuffer)  { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); }
  if (mmdata->mmBuffer2) { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); }
#endif
  ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);

/*
   MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA - numeric phase of a sparse (AIJCUSPARSE)
   times dense (DENSECUDA) product.  Computes C = op(A) op(B) with cusparseSpMM
   (CUDA >= 11) or csrmm (older CUDA); for RARt/PtAP the sparse product goes into the
   intermediate mmdata->X, followed by a dense-dense multiply with B.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* pick op(A) and the result dimensions m x n for each product type */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    /* either let cusparse transpose A implicitly, or use the explicitly formed transpose */
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* RARt/PtAP write the sparse product into the intermediate X; others write C directly */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ?
CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 2065 /* (re)allcoate mmBuffer if not initialized or LDAs are different */ 2066 if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 2067 size_t mmBufferSize; 2068 if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;} 2069 if (!mmdata->matBDescr) { 2070 stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2071 mmdata->Blda = blda; 2072 } 2073 2074 if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;} 2075 if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 2076 stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2077 mmdata->Clda = clda; 2078 } 2079 2080 if (!mat->matDescr) { 2081 stat = cusparseCreateCsr(&mat->matDescr, 2082 csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, 2083 csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), 2084 csrmat->values->data().get(), 2085 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2086 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 2087 } 2088 stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one, 2089 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2090 mmdata->matCDescr,cusparse_scalartype, 2091 cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat); 2092 if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) { 2093 cudaError_t cerr; 2094 cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); 2095 cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr); 2096 mmdata->mmBufferSize = mmBufferSize; 2097 } 2098 mmdata->initialized = PETSC_TRUE; 
2099 } else { 2100 /* to be safe, always update pointers of the mats */ 2101 stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat); 2102 stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat); 2103 stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat); 2104 } 2105 2106 /* do cusparseSpMM, which supports transpose on B */ 2107 stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one, 2108 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2109 mmdata->matCDescr,cusparse_scalartype, 2110 cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2111 #else 2112 PetscInt k; 2113 /* cusparseXcsrmm does not support transpose on B */ 2114 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2115 cublasHandle_t cublasv2handle; 2116 cublasStatus_t cerr; 2117 2118 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 2119 cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T, 2120 B->cmap->n,B->rmap->n, 2121 &PETSC_CUSPARSE_ONE ,barray,blda, 2122 &PETSC_CUSPARSE_ZERO,barray,blda, 2123 mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr); 2124 blda = B->cmap->n; 2125 k = B->cmap->n; 2126 } else { 2127 k = B->rmap->n; 2128 } 2129 2130 /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 2131 stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k, 2132 csrmat->num_entries,mat->alpha_one,mat->descr, 2133 csrmat->values->data().get(), 2134 csrmat->row_offsets->data().get(), 2135 csrmat->column_indices->data().get(), 2136 mmdata->Bt ? 
mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  if (mmdata->cisdense) { /* the user passed a plain (CPU) dense C: convert the result back in place */
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) { /* B was converted to CUDA on entry: restore its original type */
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Symbolic phase for sparse(SeqAIJCUSPARSE) x dense(SeqDenseCUDA) products.
   Sets the sizes/type of C, allocates the per-product data (MatMatCusparse) and,
   for RARt/PtAP, the intermediate dense storage X. The numeric phase is
   MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  /* result sizes depend on the product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense;
#if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}

/* Numeric phase of sparse x sparse products, run entirely on the GPU.
   Assumes MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE has already built the
   per-product data (MatMatCusparse) and the CSR structure of C. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize; /* empty result: only the assembly bookkeeping is needed */
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);

  /* symmetric matrices let us replace a transpose product with a plain AB */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                                     Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                     cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                     mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#endif
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
  ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}

/* Symbolic phase of sparse x sparse products: builds the CSR structure of C on the GPU
   (via SpGEMM/SpGEMMreuse on CUDA >= 11, Xcsrgemm otherwise) and mirrors it on the host */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed,ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t         BmatSpDescr;
#else
  int                          cnz;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  if (Acusp->format !=
MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
  /* symmetric matrices let us replace a transpose product with a plain AB */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  switch (ptype) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    k = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr);
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  ierr  = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  ierr  = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  c     = (Mat_SeqAIJ*)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr  = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
    c->compressedrow.nrows = a->compressedrow.nrows;
    ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
    ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat      = Cmat;
  Ccusp->mat->mat = Ccsr;
  Ccsr->num_rows    = Ccusp->nrows;
  Ccsr->num_cols    = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
  stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
    c->nz = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix*)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
    Bcsr = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
      ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr      = Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
                               Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                               Bcsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  /* precompute flops count */
  if (ptype == MATPRODUCT_AB) {
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i+1];
      for (j=st; j<en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2.*(b->i[brow+1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt anzi = a->i[i+1] - a->i[i];
      const PetscInt bnzi = b->i[i+1] - b->i[i];
      flops += (2.*anzi)*bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
                           NULL, NULL, NULL,
                           CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                           CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  {
    /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
       We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
    */
    void* dBuffer1 = NULL;
    void* dBuffer2 = NULL;
    void* dBuffer3 = NULL;
    /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
    size_t bufferSize1 = 0;
    size_t bufferSize2 = 0;
    size_t bufferSize3 = 0;
    size_t bufferSize4 = 0;
    size_t bufferSize5 = 0;

    /*----------------------------------------------------------------------*/
    /* ask bufferSize1 bytes for external memory */
    stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                              CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                              &bufferSize1, NULL);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc((void**) &dBuffer1, bufferSize1);CHKERRCUDA(cerr);
    /* inspect the matrices A and B to understand the memory requirement for the next step */
    stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                              CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                              &bufferSize1, dBuffer1);CHKERRCUSPARSE(stat);

    /*----------------------------------------------------------------------*/
    stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                   CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                   &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc((void**) &dBuffer2, bufferSize2);CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**) &dBuffer3, bufferSize3);CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4);CHKERRCUDA(cerr);
    stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                   CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                   &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);CHKERRCUSPARSE(stat);
    cerr = cudaFree(dBuffer1);CHKERRCUDA(cerr);
    cerr = cudaFree(dBuffer2);CHKERRCUDA(cerr);

    /*----------------------------------------------------------------------*/
    /* get matrix C non-zero entries C_nnz1 */
    stat  = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
    c->nz = (PetscInt) C_nnz1;
    /* allocate matrix C */
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
    Ccsr->values         = new THRUSTARRAY(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
    /* update matC with the new pointers */
    stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                  Ccsr->values->data().get());CHKERRCUSPARSE(stat);

    /*----------------------------------------------------------------------*/
    stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                    CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                    &bufferSize5, NULL);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5);CHKERRCUDA(cerr);
    stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                    CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                    &bufferSize5, mmdata->dBuffer5);CHKERRCUSPARSE(stat);
    cerr = cudaFree(dBuffer3);CHKERRCUDA(cerr);
    stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
    ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024);CHKERRQ(ierr);
  }
#else // ~PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  size_t bufSize2;
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  stat  = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
  c->nz = (PetscInt) C_nnz1;
  ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr);
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#endif
#else
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
  stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
  c->nz = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalizesym:
  /* mirror the GPU structure on the host (shorter version of the usual SeqAIJ setup) */
  c->singlemalloc = PETSC_FALSE;
  c->free_a       = PETSC_TRUE;
  c->free_ij      = PETSC_TRUE;
  ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
    PetscInt *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii = *Ccsr->row_offsets;
    jj = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  if (ciscompressed) { /* need to expand host row offsets */
    PetscInt r = 0;
    c->i[0] = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old  = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r+1] = old;
    }
    for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
  c->maxnz         = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax          = 0;
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k+1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt)!!nn;
    c->rmax = PetscMax(c->rmax,nn);
  }
  ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated  = PETSC_TRUE;
  C->assembled     = PETSC_FALSE;
  C->was_assembled = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/* handles sparse or dense B */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscErrorCode ierr;
  PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr);
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
  }
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr);
    }
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    switch (product->type) {
    case MATPRODUCT_AB:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr);
        ierr =
PetscOptionsBool("-matproduct_ab_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2783 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2784 } 2785 break; 2786 case MATPRODUCT_AtB: 2787 if (product->api_user) { 2788 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr); 2789 ierr = PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2790 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2791 } else { 2792 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr); 2793 ierr = PetscOptionsBool("-matproduct_atb_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2794 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2795 } 2796 break; 2797 case MATPRODUCT_PtAP: 2798 if (product->api_user) { 2799 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");CHKERRQ(ierr); 2800 ierr = PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2801 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2802 } else { 2803 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr); 2804 ierr = PetscOptionsBool("-matproduct_ptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2805 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2806 } 2807 break; 2808 case MATPRODUCT_RARt: 2809 if (product->api_user) { 2810 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");CHKERRQ(ierr); 2811 ierr = PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2812 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2813 } else { 2814 ierr = 
PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");CHKERRQ(ierr); 2815 ierr = PetscOptionsBool("-matproduct_rart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2816 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2817 } 2818 break; 2819 case MATPRODUCT_ABC: 2820 if (product->api_user) { 2821 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");CHKERRQ(ierr); 2822 ierr = PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2823 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2824 } else { 2825 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");CHKERRQ(ierr); 2826 ierr = PetscOptionsBool("-matproduct_abc_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2827 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2828 } 2829 break; 2830 default: 2831 break; 2832 } 2833 if (usecpu) Biscusp = Ciscusp = PETSC_FALSE; 2834 } 2835 /* dispatch */ 2836 if (isdense) { 2837 switch (product->type) { 2838 case MATPRODUCT_AB: 2839 case MATPRODUCT_AtB: 2840 case MATPRODUCT_ABt: 2841 case MATPRODUCT_PtAP: 2842 case MATPRODUCT_RARt: 2843 if (product->A->boundtocpu) { 2844 ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr); 2845 } else { 2846 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 2847 } 2848 break; 2849 case MATPRODUCT_ABC: 2850 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2851 break; 2852 default: 2853 break; 2854 } 2855 } else if (Biscusp && Ciscusp) { 2856 switch (product->type) { 2857 case MATPRODUCT_AB: 2858 case MATPRODUCT_AtB: 2859 case MATPRODUCT_ABt: 2860 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2861 break; 2862 case MATPRODUCT_PtAP: 2863 case MATPRODUCT_RARt: 2864 case MATPRODUCT_ABC: 2865 mat->ops->productsymbolic = 
MatProductSymbolic_ABC_Basic; 2866 break; 2867 default: 2868 break; 2869 } 2870 } else { /* fallback for AIJ */ 2871 ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 2872 } 2873 PetscFunctionReturn(0); 2874 } 2875 2876 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2877 { 2878 PetscErrorCode ierr; 2879 2880 PetscFunctionBegin; 2881 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2882 PetscFunctionReturn(0); 2883 } 2884 2885 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz) 2886 { 2887 PetscErrorCode ierr; 2888 2889 PetscFunctionBegin; 2890 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2891 PetscFunctionReturn(0); 2892 } 2893 2894 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2895 { 2896 PetscErrorCode ierr; 2897 2898 PetscFunctionBegin; 2899 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 2900 PetscFunctionReturn(0); 2901 } 2902 2903 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2904 { 2905 PetscErrorCode ierr; 2906 2907 PetscFunctionBegin; 2908 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 2909 PetscFunctionReturn(0); 2910 } 2911 2912 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2913 { 2914 PetscErrorCode ierr; 2915 2916 PetscFunctionBegin; 2917 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2918 PetscFunctionReturn(0); 2919 } 2920 2921 __global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y) 2922 { 2923 int i = blockIdx.x*blockDim.x + threadIdx.x; 2924 if (i < n) y[idx[i]] += x[i]; 2925 } 2926 2927 /* z = op(A) x + y. 
If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 2928 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm) 2929 { 2930 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2931 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 2932 Mat_SeqAIJCUSPARSEMultStruct *matstruct; 2933 PetscScalar *xarray,*zarray,*dptr,*beta,*xptr; 2934 PetscErrorCode ierr; 2935 cusparseStatus_t stat; 2936 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2937 PetscBool compressed; 2938 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2939 PetscInt nx,ny; 2940 #endif 2941 2942 PetscFunctionBegin; 2943 if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported"); 2944 if (!a->nonzerorowcnt) { 2945 if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);} 2946 else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);} 2947 PetscFunctionReturn(0); 2948 } 2949 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 2950 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2951 if (!trans) { 2952 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 2953 if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 2954 } else { 2955 if (herm || !A->form_explicit_transpose) { 2956 opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 2957 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 2958 } else { 2959 if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);} 2960 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 2961 } 2962 } 2963 /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 2964 compressed = matstruct->cprowIndices ? 
PETSC_TRUE : PETSC_FALSE; 2965 2966 try { 2967 ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 2968 if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */ 2969 else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */ 2970 2971 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2972 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 2973 /* z = A x + beta y. 2974 If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 2975 When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 2976 */ 2977 xptr = xarray; 2978 dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 2979 beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 2980 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2981 /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 2982 allocated to accommodate different uses. So we get the length info directly from mat. 2983 */ 2984 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2985 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2986 nx = mat->num_cols; 2987 ny = mat->num_rows; 2988 } 2989 #endif 2990 } else { 2991 /* z = A^T x + beta y 2992 If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 2993 Note A^Tx is of full length, so we set beta to 1.0 if y exists. 2994 */ 2995 xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 2996 dptr = zarray; 2997 beta = yy ? 
matstruct->beta_one : matstruct->beta_zero; 2998 if (compressed) { /* Scatter x to work vector */ 2999 thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 3000 thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 3001 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 3002 VecCUDAEqualsReverse()); 3003 } 3004 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3005 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3006 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 3007 nx = mat->num_rows; 3008 ny = mat->num_cols; 3009 } 3010 #endif 3011 } 3012 3013 /* csr_spmv does y = alpha op(A) x + beta y */ 3014 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3015 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3016 if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 3017 if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 3018 cudaError_t cerr; 3019 stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 3020 stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 3021 stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, 3022 matstruct->matDescr, 3023 matstruct->cuSpMV[opA].vecXDescr, beta, 3024 matstruct->cuSpMV[opA].vecYDescr, 3025 cusparse_scalartype, 3026 cusparsestruct->spmvAlg, 3027 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat); 3028 cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr); 3029 3030 matstruct->cuSpMV[opA].initialized = 
PETSC_TRUE; 3031 } else { 3032 /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 3033 stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat); 3034 stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat); 3035 } 3036 3037 stat = cusparseSpMV(cusparsestruct->handle, opA, 3038 matstruct->alpha_one, 3039 matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTransposeForMult() */ 3040 matstruct->cuSpMV[opA].vecXDescr, 3041 beta, 3042 matstruct->cuSpMV[opA].vecYDescr, 3043 cusparse_scalartype, 3044 cusparsestruct->spmvAlg, 3045 matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat); 3046 #else 3047 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 3048 stat = cusparse_csr_spmv(cusparsestruct->handle, opA, 3049 mat->num_rows, mat->num_cols, 3050 mat->num_entries, matstruct->alpha_one, matstruct->descr, 3051 mat->values->data().get(), mat->row_offsets->data().get(), 3052 mat->column_indices->data().get(), xptr, beta, 3053 dptr);CHKERRCUSPARSE(stat); 3054 #endif 3055 } else { 3056 if (cusparsestruct->nrows) { 3057 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3058 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3059 #else 3060 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 3061 stat = cusparse_hyb_spmv(cusparsestruct->handle, opA, 3062 matstruct->alpha_one, matstruct->descr, hybMat, 3063 xptr, beta, 3064 dptr);CHKERRCUSPARSE(stat); 3065 #endif 3066 } 3067 } 3068 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3069 3070 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3071 if (yy) { /* MatMultAdd: zz = A*xx + yy */ 3072 if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 3073 ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */ 3074 } else if (zz != yy) { /* A is not compressed. 
zz already contains A*xx, and we just need to add yy */ 3075 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 3076 } 3077 } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 3078 ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr); 3079 } 3080 3081 /* ScatterAdd the result from work vector into the full vector when A is compressed */ 3082 if (compressed) { 3083 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3084 /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred) 3085 and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to 3086 prevent that. So I just add a ScatterAdd kernel. 3087 */ 3088 #if 0 3089 thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 3090 thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream), 3091 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 3092 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 3093 VecCUDAPlusEquals()); 3094 #else 3095 PetscInt n = matstruct->cprowIndices->size(); 3096 ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray); 3097 #endif 3098 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3099 } 3100 } else { 3101 if (yy && yy != zz) { 3102 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 3103 } 3104 } 3105 ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 3106 if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);} 3107 else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);} 
3108 } catch(char *ex) { 3109 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 3110 } 3111 if (yy) { 3112 ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); 3113 } else { 3114 ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr); 3115 } 3116 PetscFunctionReturn(0); 3117 } 3118 3119 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 3120 { 3121 PetscErrorCode ierr; 3122 3123 PetscFunctionBegin; 3124 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 3125 PetscFunctionReturn(0); 3126 } 3127 3128 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 3129 { 3130 PetscErrorCode ierr; 3131 PetscObjectState onnz = A->nonzerostate; 3132 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3133 3134 PetscFunctionBegin; 3135 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 3136 if (onnz != A->nonzerostate && cusp->deviceMat) { 3137 cudaError_t cerr; 3138 3139 ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr); 3140 cerr = cudaFree(cusp->deviceMat);CHKERRCUDA(cerr); 3141 cusp->deviceMat = NULL; 3142 } 3143 PetscFunctionReturn(0); 3144 } 3145 3146 /* --------------------------------------------------------------------------------*/ 3147 /*@ 3148 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 3149 (the default parallel PETSc format). This matrix will ultimately pushed down 3150 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 3151 assembly performance the user should preallocate the matrix storage by setting 3152 the parameter nz (or the array nnz). By setting these parameters accurately, 3153 performance during matrix assembly can be increased by more than a factor of 50. 3154 3155 Collective 3156 3157 Input Parameters: 3158 + comm - MPI communicator, set to PETSC_COMM_SELF 3159 . m - number of rows 3160 . n - number of columns 3161 . 
nz - number of nonzeros per row (same for all rows) 3162 - nnz - array containing the number of nonzeros in the various rows 3163 (possibly different for each row) or NULL 3164 3165 Output Parameter: 3166 . A - the matrix 3167 3168 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 3169 MatXXXXSetPreallocation() paradigm instead of this routine directly. 3170 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 3171 3172 Notes: 3173 If nnz is given then nz is ignored 3174 3175 The AIJ format (also called the Yale sparse matrix format or 3176 compressed row storage), is fully compatible with standard Fortran 77 3177 storage. That is, the stored row and column indices can begin at 3178 either one (as in Fortran) or zero. See the users' manual for details. 3179 3180 Specify the preallocated storage with either nz or nnz (not both). 3181 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 3182 allocation. For large problems you MUST preallocate memory or you 3183 will get TERRIBLE performance, see the users' manual chapter on matrices. 3184 3185 By default, this format uses inodes (identical nodes) when possible, to 3186 improve numerical efficiency of matrix-vector products and solves. We 3187 search for consecutive rows with the same nonzero structure, thereby 3188 reusing matrix information to achieve increased efficiency.
3189 3190 Level: intermediate 3191 3192 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 3193 @*/ 3194 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 3195 { 3196 PetscErrorCode ierr; 3197 3198 PetscFunctionBegin; 3199 ierr = MatCreate(comm,A);CHKERRQ(ierr); 3200 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 3201 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3202 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 3203 PetscFunctionReturn(0); 3204 } 3205 3206 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 3207 { 3208 PetscErrorCode ierr; 3209 3210 PetscFunctionBegin; 3211 if (A->factortype == MAT_FACTOR_NONE) { 3212 ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr); 3213 } else { 3214 ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr); 3215 } 3216 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr); 3217 ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr); 3218 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 3219 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 3220 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr); 3221 ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 3222 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 3223 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 3224 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 
3225 PetscFunctionReturn(0); 3226 } 3227 3228 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*); 3229 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool); 3230 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B) 3231 { 3232 PetscErrorCode ierr; 3233 3234 PetscFunctionBegin; 3235 ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 3236 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr); 3237 PetscFunctionReturn(0); 3238 } 3239 3240 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str) 3241 { 3242 PetscErrorCode ierr; 3243 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 3244 Mat_SeqAIJCUSPARSE *cy; 3245 Mat_SeqAIJCUSPARSE *cx; 3246 PetscScalar *ay; 3247 const PetscScalar *ax; 3248 CsrMatrix *csry,*csrx; 3249 3250 PetscFunctionBegin; 3251 cy = (Mat_SeqAIJCUSPARSE*)Y->spptr; 3252 cx = (Mat_SeqAIJCUSPARSE*)X->spptr; 3253 if (X->ops->axpy != Y->ops->axpy) { 3254 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr); 3255 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 3256 PetscFunctionReturn(0); 3257 } 3258 /* if we are here, it means both matrices are bound to GPU */ 3259 ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr); 3260 ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr); 3261 if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported"); 3262 if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported"); 3263 csry = (CsrMatrix*)cy->mat->mat; 3264 csrx = (CsrMatrix*)cx->mat->mat; 3265 /* see if we can turn this into a cublas axpy */ 3266 if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) { 3267 bool eq = 
thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin()); 3268 if (eq) { 3269 eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin()); 3270 } 3271 if (eq) str = SAME_NONZERO_PATTERN; 3272 } 3273 /* spgeam is buggy with one column */ 3274 if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN; 3275 3276 if (str == SUBSET_NONZERO_PATTERN) { 3277 cusparseStatus_t stat; 3278 PetscScalar b = 1.0; 3279 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3280 size_t bufferSize; 3281 void *buffer; 3282 cudaError_t cerr; 3283 #endif 3284 3285 ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr); 3286 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3287 stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 3288 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3289 stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n, 3290 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3291 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3292 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat); 3293 cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr); 3294 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3295 stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3296 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3297 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3298 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat); 3299 ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr); 3300 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3301 cerr = cudaFree(buffer);CHKERRCUDA(cerr); 3302 #else 
3303 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3304 stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3305 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3306 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3307 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat); 3308 ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr); 3309 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3310 #endif 3311 stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 3312 ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr); 3313 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3314 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3315 } else if (str == SAME_NONZERO_PATTERN) { 3316 cublasHandle_t cublasv2handle; 3317 cublasStatus_t berr; 3318 PetscBLASInt one = 1, bnz = 1; 3319 3320 ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr); 3321 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3322 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 3323 ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr); 3324 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3325 berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr); 3326 ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr); 3327 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3328 ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr); 3329 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3330 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3331 } else { 3332 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr); 3333 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 3334 } 3335 PetscFunctionReturn(0); 3336 } 3337 3338 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a) 3339 { 3340 PetscErrorCode ierr; 3341 Mat_SeqAIJ *y = (Mat_SeqAIJ*)Y->data; 
3342 PetscScalar *ay; 3343 cublasHandle_t cublasv2handle; 3344 cublasStatus_t berr; 3345 PetscBLASInt one = 1, bnz = 1; 3346 3347 PetscFunctionBegin; 3348 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3349 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 3350 ierr = PetscBLASIntCast(y->nz,&bnz);CHKERRQ(ierr); 3351 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3352 berr = cublasXscal(cublasv2handle,bnz,&a,ay,one);CHKERRCUBLAS(berr); 3353 ierr = PetscLogGpuFlops(bnz);CHKERRQ(ierr); 3354 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3355 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3356 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3357 PetscFunctionReturn(0); 3358 } 3359 /* Zero all stored values of A: fill the device CSR (and the cached transpose, when present) with zeros via thrust, then zero the host array; the offload mask records which copies are valid afterwards */ 3360 static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) 3361 { 3362 PetscErrorCode ierr; 3363 PetscBool both = PETSC_FALSE; 3364 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3365 3366 PetscFunctionBegin; 3367 if (A->factortype == MAT_FACTOR_NONE) { 3368 Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr; 3369 if (spptr->mat) { 3370 CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat; 3371 if (matrix->values) { 3372 both = PETSC_TRUE; 3373 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3374 } 3375 } 3376 if (spptr->matTranspose) { 3377 CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat; 3378 if (matrix->values) { 3379 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3380 } 3381 } 3382 } 3383 //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 3384 ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr); 3385 ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 3386 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 3387 else A->offloadmask = PETSC_OFFLOAD_CPU; 3388 PetscFunctionReturn(0); 3389 } 3390 /* Toggle A's operation table between the plain SeqAIJ (CPU, flg=PETSC_TRUE) and the CUSPARSE (GPU) implementations; factored matrices are left untouched */ 3391 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg) 3392 { 3393 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3394 PetscErrorCode ierr; 3395 3396 PetscFunctionBegin; 3397 if
(A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0); 3398 if (flg) { /* bind to CPU: bring the host copy up to date, then install the plain SeqAIJ kernels */ 3399 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 3400 3401 A->ops->scale = MatScale_SeqAIJ; 3402 A->ops->axpy = MatAXPY_SeqAIJ; 3403 A->ops->zeroentries = MatZeroEntries_SeqAIJ; 3404 A->ops->mult = MatMult_SeqAIJ; 3405 A->ops->multadd = MatMultAdd_SeqAIJ; 3406 A->ops->multtranspose = MatMultTranspose_SeqAIJ; 3407 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 3408 A->ops->multhermitiantranspose = NULL; 3409 A->ops->multhermitiantransposeadd = NULL; 3410 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ; 3411 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr); 3412 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 3413 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 3414 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 3415 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 3416 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr); 3417 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr); 3418 } else { /* bind to GPU: install the CUSPARSE kernels */ 3419 A->ops->scale = MatScale_SeqAIJCUSPARSE; 3420 A->ops->axpy = MatAXPY_SeqAIJCUSPARSE; 3421 A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE; 3422 A->ops->mult = MatMult_SeqAIJCUSPARSE; 3423 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 3424 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 3425 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 3426 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 3427 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 3428
A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE; 3429 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr); 3430 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3431 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3432 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3433 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3434 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr); 3435 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3436 } 3437 A->boundtocpu = flg; 3438 a->inode.use = flg; 3439 PetscFunctionReturn(0); 3440 } 3441 /* Convert a SeqAIJ matrix to MATSEQAIJCUSPARSE; allocates the GPU context (cusparse handle, storage format, default SpMV/SpMM algorithms) on first use and installs the GPU operation table */ 3442 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 3443 { 3444 PetscErrorCode ierr; 3445 cusparseStatus_t stat; 3446 Mat B; 3447 3448 PetscFunctionBegin; 3449 ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */ 3450 if (reuse == MAT_INITIAL_MATRIX) { 3451 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 3452 } else if (reuse == MAT_REUSE_MATRIX) { 3453 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 3454 } 3455 B = *newmat; 3456 3457 ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 3458 ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr); 3459 3460 if (reuse != MAT_REUSE_MATRIX && !B->spptr) { 3461 if (B->factortype == MAT_FACTOR_NONE) { 3462
Mat_SeqAIJCUSPARSE *spptr; 3463 ierr = PetscNew(&spptr);CHKERRQ(ierr); 3464 stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3465 stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat); 3466 spptr->format = MAT_CUSPARSE_CSR; 3467 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3468 spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 3469 spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 3470 spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 3471 #endif 3472 B->spptr = spptr; 3473 } else { /* factored matrices carry a triangular-factors context instead of the plain mult context */ 3474 Mat_SeqAIJCUSPARSETriFactors *spptr; 3475 3476 ierr = PetscNew(&spptr);CHKERRQ(ierr); 3477 stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3478 stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat); 3479 B->spptr = spptr; 3480 } 3481 B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 3482 } 3483 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 3484 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 3485 B->ops->setoption = MatSetOption_SeqAIJCUSPARSE; 3486 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 3487 B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 3488 B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 3489 3490 ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr); 3491 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3492 ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 3493 PetscFunctionReturn(0); 3494 } 3495 /* Constructor for MATSEQAIJCUSPARSE: create the host SeqAIJ matrix, then convert it in place */ 3496 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 3497 { 3498 PetscErrorCode ierr; 3499 3500 PetscFunctionBegin; 3501 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 3502 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 3503 PetscFunctionReturn(0); 3504 } 3505 3506 /*MC 3507 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to
be used for sparse matrices. 3508 3509 A matrix type whose data resides on Nvidia GPUs. These matrices can be in either 3510 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 3511 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 3512 3513 Options Database Keys: 3514 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 3515 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3516 - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3517 3518 Level: beginner 3519 3520 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 3521 M*/ 3522 3523 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*); 3524 /* Register MATSOLVERCUSPARSE (LU/Cholesky/ILU/ICC on seqaijcusparse) and MATSOLVERCUSPARSEBAND (LU on seqaij) with the solver-type registry */ 3525 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 3526 { 3527 PetscErrorCode ierr; 3528 3529 PetscFunctionBegin; 3530 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr); 3531 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3532 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3533 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3534 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3535 3536
PetscFunctionReturn(0); 3537 } 3538 /* Free the per-matrix CUSPARSE context: the mult structs for A and its cached transpose, thrust work vectors, COO permutation arrays, and finally the cusparse handle */ 3539 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 3540 { 3541 PetscErrorCode ierr; 3542 cusparseStatus_t stat; 3543 3544 PetscFunctionBegin; 3545 if (*cusparsestruct) { 3546 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr); 3547 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr); 3548 delete (*cusparsestruct)->workVector; 3549 delete (*cusparsestruct)->rowoffsets_gpu; 3550 delete (*cusparsestruct)->cooPerm; 3551 delete (*cusparsestruct)->cooPerm_a; 3552 delete (*cusparsestruct)->csr2csc_i; 3553 if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);} 3554 ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr); 3555 } 3556 PetscFunctionReturn(0); 3557 } 3558 /* Release the thrust device vectors backing a CsrMatrix, then the struct itself; the pointer is nulled */ 3559 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 3560 { 3561 PetscFunctionBegin; 3562 if (*mat) { 3563 delete (*mat)->values; 3564 delete (*mat)->column_indices; 3565 delete (*mat)->row_offsets; 3566 delete *mat; 3567 *mat = 0; 3568 } 3569 PetscFunctionReturn(0); 3570 } 3571 /* Destroy one triangular-factor struct: matrix descriptor, solve analysis info, CSR storage, and device/pinned scratch buffers */ 3572 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 3573 { 3574 cusparseStatus_t stat; 3575 PetscErrorCode ierr; 3576 3577 PetscFunctionBegin; 3578 if (*trifactor) { 3579 if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); } 3580 if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); } 3581 ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 3582 if ((*trifactor)->solveBuffer) {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);} 3583 if ((*trifactor)->AA_h) {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);} 3584 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3585 if
((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);} 3586 #endif 3587 ierr = PetscFree(*trifactor);CHKERRQ(ierr); 3588 } 3589 PetscFunctionReturn(0); 3590 } 3591 /* Destroy a SpMV mult struct; dispatches on the storage format (HYB/ELL only exist before CUDA 11) and frees the descriptor, device scalar constants, and the cached SpMV descriptors/buffers */ 3592 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 3593 { 3594 CsrMatrix *mat; 3595 cusparseStatus_t stat; 3596 cudaError_t err; 3597 3598 PetscFunctionBegin; 3599 if (*matstruct) { 3600 if ((*matstruct)->mat) { 3601 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 3602 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3603 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3604 #else 3605 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 3606 stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat); 3607 #endif 3608 } else { 3609 mat = (CsrMatrix*)(*matstruct)->mat; 3610 CsrMatrix_Destroy(&mat); 3611 } 3612 } 3613 if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); } 3614 delete (*matstruct)->cprowIndices; 3615 if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); } 3616 if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); } 3617 if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); } 3618 3619 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3620 Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 3621 if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);} 3622 for (int i=0; i<3; i++) { 3623 if (mdata->cuSpMV[i].initialized) { 3624 err = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err); 3625 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat); 3626 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat); 3627 } 3628 } 3629 #endif 3630 delete *matstruct; 3631 *matstruct = NULL;
3632 } 3633 PetscFunctionReturn(0); 3634 } 3635 /* Reset (but keep) the triangular-factors container: destroy both factors and their transposes, the row/column permutation indices and the work vector; device band-solver buffers are also freed */ 3636 PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors) 3637 { 3638 PetscErrorCode ierr; 3639 3640 PetscFunctionBegin; 3641 if (*trifactors) { 3642 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr); 3643 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr); 3644 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr); 3645 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr); 3646 delete (*trifactors)->rpermIndices; 3647 delete (*trifactors)->cpermIndices; 3648 delete (*trifactors)->workVector; 3649 (*trifactors)->rpermIndices = NULL; 3650 (*trifactors)->cpermIndices = NULL; 3651 (*trifactors)->workVector = NULL; 3652 if ((*trifactors)->a_band_d) {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);} 3653 if ((*trifactors)->i_band_d) {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);} 3654 (*trifactors)->init_dev_prop = PETSC_FALSE; 3655 } 3656 PetscFunctionReturn(0); 3657 } 3658 /* Fully destroy the triangular-factors container, including its cusparse handle */ 3659 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 3660 { 3661 PetscErrorCode ierr; 3662 cusparseHandle_t handle; 3663 cusparseStatus_t stat; 3664 3665 PetscFunctionBegin; 3666 if (*trifactors) { 3667 ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr); 3668 if (handle = (*trifactors)->handle) { /* note: assignment, not comparison, is intended here */ 3669 stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 3670 } 3671 ierr = PetscFree(*trifactors);CHKERRQ(ierr); 3672 } 3673 PetscFunctionReturn(0); 3674 } 3675 /* Lexicographic (row,col) comparison used to sort COO entries */ 3676 struct IJCompare 3677 { 3678 __host__ __device__ 3679 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3680 { 3681 if (t1.get<0>() < t2.get<0>()) return true; 3682 if (t1.get<0>() == t2.get<0>()) return t1.get<1>()
< t2.get<1>(); 3683 return false; 3684 } 3685 }; 3686 /* Equality of (row,col) pairs, used to detect duplicate COO entries */ 3687 struct IJEqual 3688 { 3689 __host__ __device__ 3690 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3691 { 3692 if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false; 3693 return true; 3694 } 3695 }; 3696 /* 0/1 flag marking where two consecutive indices differ (used with adjacent_difference) */ 3697 struct IJDiff 3698 { 3699 __host__ __device__ 3700 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 3701 { 3702 return t1 == t2 ? 0 : 1; 3703 } 3704 }; 3705 /* Logical OR used to combine the row and column difference flags */ 3706 struct IJSum 3707 { 3708 __host__ __device__ 3709 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 3710 { 3711 return t1||t2; 3712 } 3713 }; 3714 3715 #include <thrust/iterator/discard_iterator.h> /* Insert or add the values v (host or device memory accepted) into the matrix through the COO permutation built by MatSetPreallocationCOO; v == NULL with INSERT_VALUES zeros the matrix */ 3716 PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 3717 { 3718 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3719 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3720 THRUSTARRAY *cooPerm_v = NULL; 3721 thrust::device_ptr<const PetscScalar> d_v; 3722 CsrMatrix *matrix; 3723 PetscErrorCode ierr; 3724 PetscInt n; 3725 3726 PetscFunctionBegin; 3727 if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct"); 3728 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix"); 3729 if (!cusp->cooPerm) { 3730 ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3731 ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3732 PetscFunctionReturn(0); 3733 } 3734 matrix = (CsrMatrix*)cusp->mat->mat; 3735 if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3736 if (!v) { 3737 if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3738 goto finalize; 3739 } 3740 n = cusp->cooPerm->size(); 3741 if (isCudaMem(v)) { 3742 d_v = thrust::device_pointer_cast(v); 3743 } else { 3744 cooPerm_v = new THRUSTARRAY(n); 3745 cooPerm_v->assign(v,v+n); 3746 d_v =
cooPerm_v->data(); 3747 ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); 3748 } 3749 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3750 if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */ /* duplicates present: reduce the permuted values by key first, then add to the CSR values */ 3751 if (cusp->cooPerm_a) { 3752 THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size()); 3753 auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3754 thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3755 thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>()); 3756 delete cooPerm_w; 3757 } else { 3758 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 3759 matrix->values->begin())); 3760 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 3761 matrix->values->end())); 3762 thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); 3763 } 3764 } else { 3765 if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */ 3766 auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3767 thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3768 } else { 3769 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 3770 matrix->values->begin())); 3771 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 3772 matrix->values->end())); 3773 thrust::for_each(zibit,zieit,VecCUDAEquals()); 3774 } 3775 } 3776 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3777 finalize: 3778 delete cooPerm_v; 3779
A->offloadmask = PETSC_OFFLOAD_GPU; 3780 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 3781 /* shorter version of MatAssemblyEnd_SeqAIJ */ 3782 ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr); 3783 ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 3784 ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr); 3785 a->reallocs = 0; 3786 A->info.mallocs += 0; 3787 A->info.nz_unneeded = 0; 3788 A->assembled = A->was_assembled = PETSC_TRUE; 3789 A->num_ass++; 3790 PetscFunctionReturn(0); 3791 } 3792 /* Mark the cached transpose as stale; with destroy=PETSC_TRUE also free it together with the csr2csc index map */ 3793 PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy) 3794 { 3795 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3796 PetscErrorCode ierr; 3797 3798 PetscFunctionBegin; 3799 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3800 if (!cusp) PetscFunctionReturn(0); 3801 if (destroy) { 3802 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr); 3803 delete cusp->csr2csc_i; 3804 cusp->csr2csc_i = NULL; 3805 } 3806 A->transupdated = PETSC_FALSE; 3807 PetscFunctionReturn(0); 3808 } 3809 3810 #include <thrust/binary_search.h> /* Set the COO nonzero pattern: sorts the (i,j) pairs on the GPU, records the permutation, detects duplicates, and builds the CSR structure on both device and host */ 3811 PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[]) 3812 { 3813 PetscErrorCode ierr; 3814 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3815 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3816 PetscInt cooPerm_n, nzr = 0; 3817 cudaError_t cerr; 3818 3819 PetscFunctionBegin; 3820 ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr); 3821 ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr); 3822 cooPerm_n = cusp->cooPerm ?
cusp->cooPerm->size() : 0; 3823 if (n != cooPerm_n) { 3824 delete cusp->cooPerm; 3825 delete cusp->cooPerm_a; 3826 cusp->cooPerm = NULL; 3827 cusp->cooPerm_a = NULL; 3828 } 3829 if (n) { 3830 THRUSTINTARRAY d_i(n); 3831 THRUSTINTARRAY d_j(n); 3832 THRUSTINTARRAY ii(A->rmap->n); 3833 3834 if (!cusp->cooPerm) { cusp->cooPerm = new THRUSTINTARRAY(n); } 3835 if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); } 3836 3837 ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 3838 d_i.assign(coo_i,coo_i+n); 3839 d_j.assign(coo_j,coo_j+n); 3840 auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin())); 3841 auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end())); 3842 3843 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); /* sort the (i,j) keys, keeping the permutation that maps user order to sorted order */ 3844 thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0); 3845 thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); 3846 *cusp->cooPerm_a = d_i; 3847 THRUSTINTARRAY w = d_j; 3848 3849 auto nekey = thrust::unique(fkey, ekey, IJEqual()); 3850 if (nekey == ekey) { /* all entries are unique */ 3851 delete cusp->cooPerm_a; 3852 cusp->cooPerm_a = NULL; 3853 } else { /* I couldn't come up with a more elegant algorithm */ 3854 adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); 3855 adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff()); 3856 (*cusp->cooPerm_a)[0] = 0; 3857 w[0] = 0; 3858 thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); 3859 thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); 3860 } 3861 thrust::counting_iterator<PetscInt> search_begin(0); 3862 thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), 3863 search_begin, search_begin + A->rmap->n, 3864 ii.begin()); 3865 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3866 3867 ierr =
MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr); 3868 a->singlemalloc = PETSC_FALSE; 3869 a->free_a = PETSC_TRUE; 3870 a->free_ij = PETSC_TRUE; 3871 ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr); 3872 a->i[0] = 0; 3873 cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 3874 a->nz = a->maxnz = a->i[A->rmap->n]; 3875 a->rmax = 0; 3876 ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr); 3877 ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr); 3878 cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 3879 if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); } 3880 if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); } /* per-row lengths, nonzero-row count and max row length for the SeqAIJ bookkeeping */ 3881 for (PetscInt i = 0; i < A->rmap->n; i++) { 3882 const PetscInt nnzr = a->i[i+1] - a->i[i]; 3883 nzr += (PetscInt)!!(nnzr); 3884 a->ilen[i] = a->imax[i] = nnzr; 3885 a->rmax = PetscMax(a->rmax,nnzr); 3886 } 3887 a->nonzerorowcnt = nzr; 3888 A->preallocated = PETSC_TRUE; 3889 ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr); 3890 ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr); 3891 } else { 3892 ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr); 3893 } 3894 ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr); 3895 3896 /* We want to allocate the CUSPARSE struct for matvec now.
3897 The code is so convoluted now that I prefer to copy zeros */ 3898 ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr); 3899 ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr); 3900 A->offloadmask = PETSC_OFFLOAD_CPU; 3901 A->nonzerostate++; 3902 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 3903 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 3904 3905 A->assembled = PETSC_FALSE; 3906 A->was_assembled = PETSC_FALSE; 3907 PetscFunctionReturn(0); 3908 } 3909 /* Borrow a read-only device pointer to the CSR values; the matrix is copied to the GPU first. Pair with MatSeqAIJCUSPARSERestoreArrayRead() */ 3910 PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a) 3911 { 3912 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3913 CsrMatrix *csr; 3914 PetscErrorCode ierr; 3915 3916 PetscFunctionBegin; 3917 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3918 PetscValidPointer(a,2); 3919 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3920 if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3921 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 3922 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3923 csr = (CsrMatrix*)cusp->mat->mat; 3924 if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3925 *a = csr->values->data().get(); 3926 PetscFunctionReturn(0); 3927 } 3928 /* Return the read-only pointer obtained with MatSeqAIJCUSPARSEGetArrayRead(); no state change since the data was not writable */ 3929 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a) 3930 { 3931 PetscFunctionBegin; 3932 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3933 PetscValidPointer(a,2); 3934 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3935 *a = NULL; 3936 PetscFunctionReturn(0); 3937 } 3938 /* Borrow a read-write device pointer to the CSR values; copies to the GPU first, marks the GPU copy authoritative and invalidates the cached transpose */ 3939 PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a) 3940 { 3941 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3942 CsrMatrix *csr; 3943 PetscErrorCode ierr; 3944 3945 PetscFunctionBegin; 3946 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3947 PetscValidPointer(a,2); 3948
PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3949 if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3950 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 3951 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3952 csr = (CsrMatrix*)cusp->mat->mat; 3953 if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3954 *a = csr->values->data().get(); 3955 A->offloadmask = PETSC_OFFLOAD_GPU; 3956 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); 3957 PetscFunctionReturn(0); 3958 } 3959 /* Return the pointer from MatSeqAIJCUSPARSEGetArray() and bump the object state, since the values may have changed */ 3960 PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a) 3961 { 3962 PetscErrorCode ierr; 3963 3964 PetscFunctionBegin; 3965 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3966 PetscValidPointer(a,2); 3967 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3968 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 3969 *a = NULL; 3970 PetscFunctionReturn(0); 3971 } 3972 /* Get a write-only device pointer: unlike GetArray, no host-to-device copy is performed since the old values will be overwritten */ 3973 PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a) 3974 { 3975 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3976 CsrMatrix *csr; 3977 PetscErrorCode ierr; 3978 3979 PetscFunctionBegin; 3980 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3981 PetscValidPointer(a,2); 3982 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3983 if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3984 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3985 csr = (CsrMatrix*)cusp->mat->mat; 3986 if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3987 *a = csr->values->data().get(); 3988 A->offloadmask = PETSC_OFFLOAD_GPU; 3989 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); 3990 PetscFunctionReturn(0); 3991 } 3992 /* Return the pointer from MatSeqAIJCUSPARSEGetArrayWrite() and bump the object state */ 3993 PetscErrorCode
MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a) 3994 { 3995 PetscErrorCode ierr; 3996 3997 PetscFunctionBegin; 3998 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3999 PetscValidPointer(a,2); 4000 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4001 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 4002 *a = NULL; 4003 PetscFunctionReturn(0); 4004 } 4005 /* Comparison on the (row,col) part of the 4-tuples used when merging two matrices */ 4006 struct IJCompare4 4007 { 4008 __host__ __device__ 4009 inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) 4010 { 4011 if (t1.get<0>() < t2.get<0>()) return true; 4012 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 4013 return false; 4014 } 4015 }; 4016 /* Add a fixed shift to a column index (used to offset B's columns past A's) */ 4017 struct Shift 4018 { 4019 int _shift; 4020 4021 Shift(int shift) : _shift(shift) {} 4022 __host__ __device__ 4023 inline int operator() (const int &c) 4024 { 4025 return c + _shift; 4026 } 4027 }; 4028 4029 /* merges two SeqAIJCUSPARSE matrices, [A';B']' operation in matlab notation */ 4030 PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 4031 { 4032 PetscErrorCode ierr; 4033 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c; 4034 Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp; 4035 Mat_SeqAIJCUSPARSEMultStruct *Cmat; 4036 CsrMatrix *Acsr,*Bcsr,*Ccsr; 4037 PetscInt Annz,Bnnz; 4038 cusparseStatus_t stat; 4039 PetscInt i,m,n,zero = 0; 4040 cudaError_t cerr; 4041 4042 PetscFunctionBegin; 4043 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4044 PetscValidHeaderSpecific(B,MAT_CLASSID,2); 4045 PetscValidPointer(C,4); 4046 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4047 PetscCheckTypeName(B,MATSEQAIJCUSPARSE); 4048 if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n); 4049 if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 4050
if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4051 if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4052 if (reuse == MAT_INITIAL_MATRIX) { 4053 m = A->rmap->n; 4054 n = A->cmap->n + B->cmap->n; 4055 ierr = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr); 4056 ierr = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr); 4057 ierr = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 4058 c = (Mat_SeqAIJ*)(*C)->data; 4059 Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 4060 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 4061 Ccsr = new CsrMatrix; 4062 Cmat->cprowIndices = NULL; 4063 c->compressedrow.use = PETSC_FALSE; 4064 c->compressedrow.nrows = 0; 4065 c->compressedrow.i = NULL; 4066 c->compressedrow.rindex = NULL; 4067 Ccusp->workVector = NULL; 4068 Ccusp->nrows = m; 4069 Ccusp->mat = Cmat; 4070 Ccusp->mat->mat = Ccsr; 4071 Ccsr->num_rows = m; 4072 Ccsr->num_cols = n; 4073 stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat); 4074 stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4075 stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 4076 cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 4077 cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 4078 cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 4079 cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4080 cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4081 cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4082 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4083 ierr = 
MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 4084 ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr); 4085 ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr); 4086 if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4087 if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4088 4089 Acsr = (CsrMatrix*)Acusp->mat->mat; 4090 Bcsr = (CsrMatrix*)Bcusp->mat->mat; 4091 Annz = (PetscInt)Acsr->column_indices->size(); 4092 Bnnz = (PetscInt)Bcsr->column_indices->size(); 4093 c->nz = Annz + Bnnz; 4094 Ccsr->row_offsets = new THRUSTINTARRAY32(m+1); 4095 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 4096 Ccsr->values = new THRUSTARRAY(c->nz); 4097 Ccsr->num_entries = c->nz; 4098 Ccusp->cooPerm = new THRUSTINTARRAY(c->nz); 4099 if (c->nz) { 4100 auto Acoo = new THRUSTINTARRAY32(Annz); 4101 auto Bcoo = new THRUSTINTARRAY32(Bnnz); 4102 auto Ccoo = new THRUSTINTARRAY32(c->nz); 4103 THRUSTINTARRAY32 *Aroff,*Broff; 4104 4105 if (a->compressedrow.use) { /* need full row offset */ 4106 if (!Acusp->rowoffsets_gpu) { 4107 Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 4108 Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 4109 ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 4110 } 4111 Aroff = Acusp->rowoffsets_gpu; 4112 } else Aroff = Acsr->row_offsets; 4113 if (b->compressedrow.use) { /* need full row offset */ 4114 if (!Bcusp->rowoffsets_gpu) { 4115 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 4116 Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 4117 ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 4118 } 4119 Broff = Bcusp->rowoffsets_gpu; 4120 } else Broff = Bcsr->row_offsets; 4121 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4122 stat = cusparseXcsr2coo(Acusp->handle, 4123 Aroff->data().get(), 4124 Annz, 4125 m, 4126 Acoo->data().get(), 4127 
CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4128 stat = cusparseXcsr2coo(Bcusp->handle, 4129 Broff->data().get(), 4130 Bnnz, 4131 m, 4132 Bcoo->data().get(), 4133 CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4134 /* Issues when using bool with large matrices on SUMMIT 10.2.89 */ 4135 auto Aperm = thrust::make_constant_iterator(1); 4136 auto Bperm = thrust::make_constant_iterator(0); 4137 #if PETSC_PKG_CUDA_VERSION_GE(10,0,0) 4138 auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n)); 4139 auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n)); 4140 #else 4141 /* there are issues instantiating the merge operation using a transform iterator for the columns of B */ 4142 auto Bcib = Bcsr->column_indices->begin(); 4143 auto Bcie = Bcsr->column_indices->end(); 4144 thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n)); 4145 #endif 4146 auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz); 4147 auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm)); 4148 auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm)); 4149 auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm)); 4150 auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm)); 4151 auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin())); 4152 auto p1 = Ccusp->cooPerm->begin(); 4153 auto p2 = Ccusp->cooPerm->begin(); 4154 thrust::advance(p2,Annz); 4155 PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4())); 4156 #if PETSC_PKG_CUDA_VERSION_LT(10,0,0) 4157 thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n)); 4158 #endif 4159 auto cci = thrust::make_counting_iterator(zero); 4160 auto cce = 
thrust::make_counting_iterator(c->nz); 4161 #if 0 //Errors on SUMMIT cuda 11.1.0 4162 PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>())); 4163 #else 4164 auto pred = thrust::identity<int>(); 4165 PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred)); 4166 PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred)); 4167 #endif 4168 stat = cusparseXcoo2csr(Ccusp->handle, 4169 Ccoo->data().get(), 4170 c->nz, 4171 m, 4172 Ccsr->row_offsets->data().get(), 4173 CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4174 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4175 delete wPerm; 4176 delete Acoo; 4177 delete Bcoo; 4178 delete Ccoo; 4179 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4180 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, 4181 Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), 4182 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 4183 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 4184 #endif 4185 if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */ 4186 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4187 Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct; 4188 CsrMatrix *CcsrT = new CsrMatrix; 4189 CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4190 CsrMatrix *BcsrT = BT ? 
(CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4191 4192 (*C)->form_explicit_transpose = PETSC_TRUE; 4193 (*C)->transupdated = PETSC_TRUE; 4194 Ccusp->rowoffsets_gpu = NULL; 4195 CmatT->cprowIndices = NULL; 4196 CmatT->mat = CcsrT; 4197 CcsrT->num_rows = n; 4198 CcsrT->num_cols = m; 4199 CcsrT->num_entries = c->nz; 4200 4201 CcsrT->row_offsets = new THRUSTINTARRAY32(n+1); 4202 CcsrT->column_indices = new THRUSTINTARRAY32(c->nz); 4203 CcsrT->values = new THRUSTARRAY(c->nz); 4204 4205 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4206 auto rT = CcsrT->row_offsets->begin(); 4207 if (AT) { 4208 rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT); 4209 thrust::advance(rT,-1); 4210 } 4211 if (BT) { 4212 auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz)); 4213 auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz)); 4214 thrust::copy(titb,tite,rT); 4215 } 4216 auto cT = CcsrT->column_indices->begin(); 4217 if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT); 4218 if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT); 4219 auto vT = CcsrT->values->begin(); 4220 if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4221 if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 4222 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4223 4224 stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat); 4225 stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4226 stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 4227 cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 4228 cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 4229 cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 4230 cerr = 
cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4231 cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4232 cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4233 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4234 stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, 4235 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), 4236 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 4237 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 4238 #endif 4239 Ccusp->matTranspose = CmatT; 4240 } 4241 } 4242 4243 c->singlemalloc = PETSC_FALSE; 4244 c->free_a = PETSC_TRUE; 4245 c->free_ij = PETSC_TRUE; 4246 ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 4247 ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 4248 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 4249 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 4250 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 4251 ii = *Ccsr->row_offsets; 4252 jj = *Ccsr->column_indices; 4253 cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4254 cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4255 } else { 4256 cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4257 cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4258 } 4259 ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 4260 ierr = 
PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 4261 ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 4262 c->maxnz = c->nz; 4263 c->nonzerorowcnt = 0; 4264 c->rmax = 0; 4265 for (i = 0; i < m; i++) { 4266 const PetscInt nn = c->i[i+1] - c->i[i]; 4267 c->ilen[i] = c->imax[i] = nn; 4268 c->nonzerorowcnt += (PetscInt)!!nn; 4269 c->rmax = PetscMax(c->rmax,nn); 4270 } 4271 ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr); 4272 ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 4273 (*C)->nonzerostate++; 4274 ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr); 4275 ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr); 4276 Ccusp->nonzerostate = (*C)->nonzerostate; 4277 (*C)->preallocated = PETSC_TRUE; 4278 } else { 4279 if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n); 4280 c = (Mat_SeqAIJ*)(*C)->data; 4281 if (c->nz) { 4282 Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 4283 if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm"); 4284 if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4285 if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate"); 4286 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4287 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 4288 if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4289 if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4290 Acsr = (CsrMatrix*)Acusp->mat->mat; 4291 Bcsr = (CsrMatrix*)Bcusp->mat->mat; 4292 Ccsr = (CsrMatrix*)Ccusp->mat->mat; 4293 if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size()); 4294 if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D 
!= %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size()); 4295 if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size()); 4296 if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries); 4297 if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size()); 4298 auto pmid = Ccusp->cooPerm->begin(); 4299 thrust::advance(pmid,Acsr->num_entries); 4300 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4301 auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), 4302 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin()))); 4303 auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), 4304 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4305 thrust::for_each(zibait,zieait,VecCUDAEquals()); 4306 auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), 4307 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4308 auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), 4309 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end()))); 4310 thrust::for_each(zibbit,ziebit,VecCUDAEquals()); 4311 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr); 4312 if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) { 4313 if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct"); 4314 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4315 CsrMatrix *AcsrT = AT ? 
(CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    }
  }
  ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}

/* Gather selected entries of the aij value array of A into v[].

   If idx is provided, v[k] = a[idx[k]] for k = 0..n-1 (a gather through a
   permutation iterator executed on the GPU); otherwise the first n values
   are copied contiguously. v may point to either host or device memory;
   the destination kind is detected with isCudaMem() and the copy direction
   chosen accordingly.

   Collective on nothing; runs on the matrix's GPU data (triggers a
   copy-to-GPU via MatSeqAIJCUSPARSEGetArrayRead if the values are stale). */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;  /* true when v is device memory */
  const PetscScalar *av;
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    /* upload the gather indices; this is a host -> device transfer */
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    THRUSTARRAY *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      /* v is host memory: gather into a device scratch buffer first */
      w  = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) {
      cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    delete w;
  } else {
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  /* When v is host memory the values just moved GPU -> CPU, so log that
     direction (the original code logged CpuToGpu here, which inverted the
     transfer accounting). */
  if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); }
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}