1 /* 2 Defines the basic matrix operations for the AIJ (compressed row) 3 matrix storage format using the CUSPARSE library, 4 */ 5 #define PETSC_SKIP_SPINLOCK 6 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 7 8 #include <petscconf.h> 9 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 10 #include <../src/mat/impls/sbaij/seq/sbaij.h> 11 #include <../src/vec/vec/impls/dvecimpl.h> 12 #include <petsc/private/vecimpl.h> 13 #undef VecType 14 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 15 #include <thrust/async/for_each.h> 16 17 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 18 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 19 /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in 20 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them. 21 22 typedef enum { 23 CUSPARSE_MV_ALG_DEFAULT = 0, 24 CUSPARSE_COOMV_ALG = 1, 25 CUSPARSE_CSRMV_ALG1 = 2, 26 CUSPARSE_CSRMV_ALG2 = 3 27 } cusparseSpMVAlg_t; 28 29 typedef enum { 30 CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0, 31 CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1, 32 CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2, 33 CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3, 34 CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4, 35 CUSPARSE_SPMM_ALG_DEFAULT = 0, 36 CUSPARSE_SPMM_COO_ALG1 = 1, 37 CUSPARSE_SPMM_COO_ALG2 = 2, 38 CUSPARSE_SPMM_COO_ALG3 = 3, 39 CUSPARSE_SPMM_COO_ALG4 = 5, 40 CUSPARSE_SPMM_CSR_ALG1 = 4, 41 CUSPARSE_SPMM_CSR_ALG2 = 6, 42 } cusparseSpMMAlg_t; 43 44 typedef enum { 45 CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc 46 CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministc 47 } cusparseCsr2CscAlg_t; 48 */ 49 const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0}; 50 const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0}; 51 const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0}; 52 #endif 53 54 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 55 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 56 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 57 58 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 59 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 60 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 61 62 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 63 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 64 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 65 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 66 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat); 67 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure); 68 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar); 69 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 70 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 71 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 72 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 73 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 74 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 75 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool); 76 77 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**); 78 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**); 79 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat); 80 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**); 81 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**); 82 83 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat); 84 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat); 85 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool); 86 87 PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]); 88 PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode); 89 90 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]); 91 92 PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream) 93 { 94 cusparseStatus_t stat; 95 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 96 97 PetscFunctionBegin; 98 if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 99 cusparsestruct->stream = stream; 100 stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat); 101 PetscFunctionReturn(0); 102 } 103 104 PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle) 105 { 106 cusparseStatus_t stat; 107 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 108 109 PetscFunctionBegin; 110 if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 111 if (cusparsestruct->handle != handle) { 112 if (cusparsestruct->handle) { 113 stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat); 114 } 115 cusparsestruct->handle = handle; 116 } 117 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 118 PetscFunctionReturn(0); 119 } 120 121 PetscErrorCode MatCUSPARSEClearHandle(Mat A) 122 { 123 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 124 PetscBool flg; 125 PetscErrorCode ierr; 126 127 PetscFunctionBegin; 128 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 129 if (!flg || !cusparsestruct) PetscFunctionReturn(0); 130 if (cusparsestruct->handle) cusparsestruct->handle = 0; 131 PetscFunctionReturn(0); 132 } 133 134 PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type) 135 { 136 PetscFunctionBegin; 137 *type = MATSOLVERCUSPARSE; 138 PetscFunctionReturn(0); 139 } 140 141 /*MC 142 MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices 143 on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported 144 algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 145 performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 146 CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 147 algorithms are not recommended. This class does NOT support direct solver operations. 148 149 Level: beginner 150 151 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 152 M*/ 153 154 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B) 155 { 156 PetscErrorCode ierr; 157 PetscInt n = A->rmap->n; 158 159 PetscFunctionBegin; 160 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 161 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 162 (*B)->factortype = ftype; 163 ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 164 165 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 166 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 167 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 168 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 169 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 170 ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr); 171 ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr); 172 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 173 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 174 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 175 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr); 176 ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr); 177 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 178 179 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 180 (*B)->canuseordering = PETSC_TRUE; 181 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr); 182 PetscFunctionReturn(0); 183 } 184 185 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 186 { 187 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 188 189 PetscFunctionBegin; 190 switch (op) { 191 case MAT_CUSPARSE_MULT: 192 cusparsestruct->format = format; 193 break; 194 case MAT_CUSPARSE_ALL: 195 cusparsestruct->format = format; 196 break; 197 default: 198 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op); 199 } 200 PetscFunctionReturn(0); 201 } 202 203 /*@ 204 MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 205 operation. Only the MatMult operation can use different GPU storage formats 206 for MPIAIJCUSPARSE matrices. 207 Not Collective 208 209 Input Parameters: 210 + A - Matrix of type SEQAIJCUSPARSE 211 . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 212 - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2) 213 214 Output Parameter: 215 216 Level: intermediate 217 218 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 219 @*/ 220 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 221 { 222 PetscErrorCode ierr; 223 224 PetscFunctionBegin; 225 PetscValidHeaderSpecific(A, MAT_CLASSID,1); 226 ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr); 227 PetscFunctionReturn(0); 228 } 229 230 PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg) 231 { 232 PetscErrorCode ierr; 233 234 PetscFunctionBegin; 235 switch (op) { 236 case MAT_FORM_EXPLICIT_TRANSPOSE: 237 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 238 if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);} 239 A->form_explicit_transpose = flg; 240 break; 241 default: 242 ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr); 243 break; 244 } 245 PetscFunctionReturn(0); 246 } 247 248 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A); 249 250 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 251 { 252 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 253 IS isrow = b->row,iscol = b->col; 254 PetscBool row_identity,col_identity; 255 PetscErrorCode ierr; 256 257 PetscFunctionBegin; 258 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 259 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 260 B->offloadmask = PETSC_OFFLOAD_CPU; 261 /* determine which version of MatSolve needs to be used. */ 262 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 263 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 264 if (row_identity && col_identity) { 265 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 266 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 267 B->ops->matsolve = NULL; 268 B->ops->matsolvetranspose = NULL; 269 } else { 270 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 271 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 272 B->ops->matsolve = NULL; 273 B->ops->matsolvetranspose = NULL; 274 } 275 276 /* get the triangular factors */ 277 ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 278 PetscFunctionReturn(0); 279 } 280 281 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 282 { 283 PetscErrorCode ierr; 284 MatCUSPARSEStorageFormat format; 285 PetscBool flg; 286 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 287 288 PetscFunctionBegin; 289 ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr); 290 if (A->factortype == MAT_FACTOR_NONE) { 291 ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV", 292 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 293 if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);} 294 295 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 296 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 297 if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);} 298 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 299 ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", 300 "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr); 301 /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */ 302 if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 303 304 ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", 305 "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr); 306 if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly"); 307 308 ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", 309 "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr); 310 if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly"); 311 #endif 312 } 313 ierr = PetscOptionsTail();CHKERRQ(ierr); 314 PetscFunctionReturn(0); 315 } 316 317 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 318 { 319 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 320 PetscErrorCode ierr; 321 322 PetscFunctionBegin; 323 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 324 ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 325 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 326 PetscFunctionReturn(0); 327 } 328 329 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 330 { 331 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 332 PetscErrorCode ierr; 333 334 PetscFunctionBegin; 335 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 336 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 337 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 338 PetscFunctionReturn(0); 339 } 340 341 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 342 { 343 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 344 PetscErrorCode ierr; 345 346 PetscFunctionBegin; 347 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 348 ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 349 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 350 PetscFunctionReturn(0); 351 } 352 353 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 354 { 355 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 356 PetscErrorCode ierr; 357 358 PetscFunctionBegin; 359 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 360 ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 361 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 362 PetscFunctionReturn(0); 363 } 364 365 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 366 { 367 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 368 PetscInt n = A->rmap->n; 369 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 370 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 371 cusparseStatus_t stat; 372 const PetscInt *ai = a->i,*aj = a->j,*vi; 373 const MatScalar *aa = a->a,*v; 374 PetscInt *AiLo, *AjLo; 375 PetscInt i,nz, nzLower, offset, rowOffset; 376 PetscErrorCode ierr; 377 cudaError_t cerr; 378 379 PetscFunctionBegin; 380 if (!n) PetscFunctionReturn(0); 381 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 382 try { 383 /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */ 384 nzLower=n+ai[n]-ai[1]; 385 if (!loTriFactor) { 386 PetscScalar *AALo; 387 388 cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr); 389 390 /* Allocate Space for the lower triangular matrix */ 391 cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 392 cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr); 393 394 /* Fill the lower triangular matrix */ 395 AiLo[0] = (PetscInt) 0; 396 AiLo[n] = nzLower; 397 AjLo[0] = (PetscInt) 0; 398 AALo[0] = (MatScalar) 1.0; 399 v = aa; 400 vi = aj; 401 offset = 1; 402 rowOffset= 1; 403 for (i=1; i<n; i++) { 404 nz = ai[i+1] - ai[i]; 405 /* additional 1 for the term on the diagonal */ 406 AiLo[i] = rowOffset; 407 rowOffset += nz+1; 408 409 ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr); 410 ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr); 411 412 offset += nz; 413 AjLo[offset] = (PetscInt) i; 414 AALo[offset] = (MatScalar) 1.0; 415 offset += 1; 416 417 v += nz; 418 vi += nz; 419 } 420 421 /* allocate space for the triangular factor information */ 422 ierr = PetscNew(&loTriFactor);CHKERRQ(ierr); 423 loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 424 /* Create the matrix description */ 425 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 426 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 427 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 428 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 429 #else 430 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 431 #endif 432 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat); 433 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 434 435 /* set the operation */ 436 loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 437 438 /* set the matrix */ 439 loTriFactor->csrMat = new CsrMatrix; 440 loTriFactor->csrMat->num_rows = n; 441 loTriFactor->csrMat->num_cols = n; 442 loTriFactor->csrMat->num_entries = nzLower; 443 444 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 445 loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1); 446 447 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 448 loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower); 449 450 loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 451 loTriFactor->csrMat->values->assign(AALo, AALo+nzLower); 452 453 /* Create the solve analysis information */ 454 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 455 stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 456 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 457 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 458 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 459 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 460 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 461 &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 462 cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 463 #endif 464 465 /* perform the solve analysis */ 466 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 467 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 468 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 469 loTriFactor->csrMat->column_indices->data().get(), 470 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 471 loTriFactor->solveInfo, 472 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 473 #else 474 loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 475 #endif 476 cerr = WaitForCUDA();CHKERRCUDA(cerr); 477 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 478 479 /* assign the pointer */ 480 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 481 loTriFactor->AA_h = AALo; 482 cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr); 483 cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr); 484 ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 485 } else { /* update values only */ 486 if (!loTriFactor->AA_h) { 487 cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr); 488 } 489 /* Fill the lower triangular matrix */ 490 loTriFactor->AA_h[0] = 1.0; 491 v = aa; 492 vi = aj; 493 offset = 1; 494 for (i=1; i<n; i++) { 495 nz = ai[i+1] - ai[i]; 496 ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr); 497 offset += nz; 498 loTriFactor->AA_h[offset] = 1.0; 499 offset += 1; 500 v += nz; 501 } 502 loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower); 503 ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 504 } 505 } catch(char *ex) { 506 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 507 } 508 } 509 PetscFunctionReturn(0); 510 } 511 512 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 513 { 514 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 515 PetscInt n = A->rmap->n; 516 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 517 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 518 cusparseStatus_t stat; 519 const PetscInt *aj = a->j,*adiag = a->diag,*vi; 520 const MatScalar *aa = a->a,*v; 521 PetscInt *AiUp, *AjUp; 522 PetscInt i,nz, nzUpper, offset; 523 PetscErrorCode ierr; 524 cudaError_t cerr; 525 526 PetscFunctionBegin; 527 if (!n) PetscFunctionReturn(0); 528 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 529 try { 530 /* next, figure out the number of nonzeros in the upper triangular matrix. */ 531 nzUpper = adiag[0]-adiag[n]; 532 if (!upTriFactor) { 533 PetscScalar *AAUp; 534 535 cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 536 537 /* Allocate Space for the upper triangular matrix */ 538 cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 539 cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 540 541 /* Fill the upper triangular matrix */ 542 AiUp[0]=(PetscInt) 0; 543 AiUp[n]=nzUpper; 544 offset = nzUpper; 545 for (i=n-1; i>=0; i--) { 546 v = aa + adiag[i+1] + 1; 547 vi = aj + adiag[i+1] + 1; 548 549 /* number of elements NOT on the diagonal */ 550 nz = adiag[i] - adiag[i+1]-1; 551 552 /* decrement the offset */ 553 offset -= (nz+1); 554 555 /* first, set the diagonal elements */ 556 AjUp[offset] = (PetscInt) i; 557 AAUp[offset] = (MatScalar)1./v[nz]; 558 AiUp[i] = AiUp[i+1] - (nz+1); 559 560 ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr); 561 ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr); 562 } 563 564 /* allocate space for the triangular factor information */ 565 ierr = PetscNew(&upTriFactor);CHKERRQ(ierr); 566 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 567 568 /* Create the matrix description */ 569 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 570 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 571 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 572 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 573 #else 574 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 575 #endif 576 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 577 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 578 579 /* set the operation */ 580 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 581 582 /* set the matrix */ 583 upTriFactor->csrMat = new CsrMatrix; 584 upTriFactor->csrMat->num_rows = n; 585 upTriFactor->csrMat->num_cols = n; 586 upTriFactor->csrMat->num_entries = nzUpper; 587 588 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 589 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 590 591 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 592 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 593 594 upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 595 upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 596 597 /* Create the solve analysis information */ 598 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 599 stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 600 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 601 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 602 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 603 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 604 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 605 &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 606 cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 607 #endif 608 609 /* perform the solve analysis */ 610 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 611 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 612 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 613 upTriFactor->csrMat->column_indices->data().get(), 614 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 615 upTriFactor->solveInfo, 616 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 617 #else 618 upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 619 #endif 620 cerr = WaitForCUDA();CHKERRCUDA(cerr); 621 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 622 623 /* assign the pointer */ 624 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 625 upTriFactor->AA_h = AAUp; 626 cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 627 cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 628 ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 629 } else { 630 if (!upTriFactor->AA_h) { 631 cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 632 } 633 /* Fill the upper triangular matrix */ 634 offset = nzUpper; 635 for (i=n-1; i>=0; i--) { 636 v = aa + adiag[i+1] + 1; 637 638 /* number of elements NOT on the diagonal */ 639 nz = adiag[i] - adiag[i+1]-1; 640 641 /* decrement the offset */ 642 offset -= (nz+1); 643 644 /* first, set the diagonal elements */ 645 upTriFactor->AA_h[offset] = 1./v[nz]; 646 ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr); 647 } 648 upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper); 649 ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 650 } 651 } catch(char *ex) { 652 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 653 } 654 } 655 PetscFunctionReturn(0); 656 } 657 658 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 659 { 660 PetscErrorCode ierr; 661 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 662 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 663 IS isrow = a->row,iscol = a->icol; 664 PetscBool row_identity,col_identity; 665 PetscInt n = A->rmap->n; 666 667 PetscFunctionBegin; 668 if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 669 ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr); 670 ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr); 671 672 if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 673 cusparseTriFactors->nnz=a->nz; 674 675 A->offloadmask = PETSC_OFFLOAD_BOTH; 676 /* lower triangular indices */ 677 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 678 if (!row_identity && !cusparseTriFactors->rpermIndices) { 679 const PetscInt *r; 680 681 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 682 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 683 cusparseTriFactors->rpermIndices->assign(r, r+n); 684 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 685 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 686 } 687 688 /* upper triangular indices */ 689 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 690 if (!col_identity && !cusparseTriFactors->cpermIndices) { 691 const PetscInt *c; 692 693 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 694 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 695 cusparseTriFactors->cpermIndices->assign(c, c+n); 696 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 697 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 698 } 699 PetscFunctionReturn(0); 700 } 701 702 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 703 { 704 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 705 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 706 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 707 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 708 cusparseStatus_t stat; 709 PetscErrorCode ierr; 710 cudaError_t cerr; 711 PetscInt *AiUp, *AjUp; 712 PetscScalar *AAUp; 713 PetscScalar *AALo; 714 PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 715 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 716 const PetscInt *ai = b->i,*aj = b->j,*vj; 717 const MatScalar *aa = b->a,*v; 718 719 PetscFunctionBegin; 720 if (!n) PetscFunctionReturn(0); 721 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 722 try { 723 cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 724 cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 725 if (!upTriFactor && !loTriFactor) { 726 /* Allocate Space for the upper triangular matrix */ 727 cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 728 cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 729 730 /* Fill the upper triangular matrix */ 731 AiUp[0]=(PetscInt) 0; 732 AiUp[n]=nzUpper; 733 offset = 0; 734 for (i=0; i<n; i++) { 735 /* set the pointers */ 736 v = aa + ai[i]; 737 vj = aj + ai[i]; 738 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 739 740 /* first, set the diagonal elements */ 741 AjUp[offset] = (PetscInt) i; 742 AAUp[offset] = (MatScalar)1.0/v[nz]; 743 AiUp[i] = offset; 744 AALo[offset] = (MatScalar)1.0/v[nz]; 745 746 offset+=1; 747 if (nz>0) { 748 ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr); 749 ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 750 for (j=offset; j<offset+nz; j++) { 751 AAUp[j] = -AAUp[j]; 752 AALo[j] = AAUp[j]/v[nz]; 753 } 754 offset+=nz; 755 } 756 } 757 758 /* allocate space for the triangular factor information */ 759 ierr = PetscNew(&upTriFactor);CHKERRQ(ierr); 760 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 761 762 /* Create the matrix description */ 763 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 764 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 765 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 766 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 767 #else 768 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 769 #endif 770 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 771 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 772 773 /* set the matrix */ 774 upTriFactor->csrMat = new CsrMatrix; 775 upTriFactor->csrMat->num_rows = A->rmap->n; 776 upTriFactor->csrMat->num_cols = A->cmap->n; 777 upTriFactor->csrMat->num_entries = a->nz; 778 779 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 780 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 781 782 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 783 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 784 785 upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 786 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 787 788 /* set the operation */ 789 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 790 791 /* Create the solve analysis information */ 792 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 793 stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 794 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 795 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 796 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 797 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 798 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 799 &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 800 cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 801 #endif 802 803 /* perform the solve analysis */ 804 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 805 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 806 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 807 upTriFactor->csrMat->column_indices->data().get(), 808 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 809 upTriFactor->solveInfo, 810 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 811 #else 812 upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 813 #endif 814 cerr = WaitForCUDA();CHKERRCUDA(cerr); 815 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 816 817 /* assign the pointer */ 818 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 819 820 /* allocate space for the triangular factor information */ 821 ierr = PetscNew(&loTriFactor);CHKERRQ(ierr); 822 loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 823 824 /* Create the matrix description */ 825 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 826 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 827 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 828 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 829 #else 830 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 831 #endif 832 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 833 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 834 835 /* set the operation */ 836 loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 837 838 /* set the matrix */ 839 loTriFactor->csrMat = new CsrMatrix; 840 loTriFactor->csrMat->num_rows = A->rmap->n; 841 loTriFactor->csrMat->num_cols = A->cmap->n; 842 loTriFactor->csrMat->num_entries = a->nz; 843 844 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 845 loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 846 847 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 848 loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 849 850 loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 851 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 852 853 /* Create the solve analysis information */ 854 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 855 stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 856 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 857 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 858 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 859 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 860 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 861 &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 862 cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 863 #endif 864 865 /* perform the solve analysis */ 866 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 867 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 868 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 869 loTriFactor->csrMat->column_indices->data().get(), 870 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 871 loTriFactor->solveInfo, 872 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 873 #else 874 loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 875 #endif 876 cerr = WaitForCUDA();CHKERRCUDA(cerr); 877 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 878 879 /* assign the pointer */ 880 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 881 882 ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr); 883 cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 884 cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 885 } else { 886 /* Fill the upper triangular matrix */ 887 offset = 0; 888 for (i=0; i<n; i++) { 889 /* set the pointers */ 890 v = aa + ai[i]; 891 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 892 893 /* first, set the diagonal elements */ 894 AAUp[offset] = 1.0/v[nz]; 895 AALo[offset] = 1.0/v[nz]; 896 897 offset+=1; 898 if (nz>0) { 899 ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 900 for (j=offset; j<offset+nz; j++) { 901 AAUp[j] = -AAUp[j]; 902 AALo[j] = AAUp[j]/v[nz]; 903 } 904 offset+=nz; 905 } 906 } 907 if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 908 if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 909 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 910 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 911 ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 912 } 913 cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr); 914 cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr); 915 } catch(char *ex) { 916 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 917 } 918 } 919 PetscFunctionReturn(0); 920 } 921 922 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 923 { 924 PetscErrorCode ierr; 925 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 926 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 927 IS ip = a->row; 928 PetscBool perm_identity; 929 PetscInt n = A->rmap->n; 930 931 PetscFunctionBegin; 932 if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 933 ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr); 934 if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 935 cusparseTriFactors->nnz=(a->nz-n)*2 + n; 936 937 A->offloadmask = PETSC_OFFLOAD_BOTH; 938 939 /* lower triangular indices */ 940 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 941 if (!perm_identity) { 942 IS iip; 943 const PetscInt *irip,*rip; 944 945 ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr); 946 ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr); 947 ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); 948 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 949 cusparseTriFactors->rpermIndices->assign(rip, rip+n); 950 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 951 cusparseTriFactors->cpermIndices->assign(irip, irip+n); 952 ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr); 953 ierr = ISDestroy(&iip);CHKERRQ(ierr); 954 ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 955 ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 956 } 957 PetscFunctionReturn(0); 958 } 959 960 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 961 { 962 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 963 IS ip = b->row; 964 PetscBool perm_identity; 965 PetscErrorCode ierr; 966 967 PetscFunctionBegin; 968 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 969 ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 970 B->offloadmask = PETSC_OFFLOAD_CPU; 971 /* determine which version of MatSolve needs to be used. */ 972 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 973 if (perm_identity) { 974 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 975 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 976 B->ops->matsolve = NULL; 977 B->ops->matsolvetranspose = NULL; 978 } else { 979 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 980 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 981 B->ops->matsolve = NULL; 982 B->ops->matsolvetranspose = NULL; 983 } 984 985 /* get the triangular factors */ 986 ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 987 PetscFunctionReturn(0); 988 } 989 990 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 991 { 992 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 993 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 994 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 995 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; 996 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; 997 cusparseStatus_t stat; 998 cusparseIndexBase_t indexBase; 999 cusparseMatrixType_t matrixType; 1000 cusparseFillMode_t fillMode; 1001 cusparseDiagType_t diagType; 1002 cudaError_t cerr; 1003 PetscErrorCode ierr; 1004 1005 PetscFunctionBegin; 1006 /* allocate space for the transpose of the lower triangular factor */ 1007 ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr); 1008 loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1009 1010 /* set the matrix descriptors of the lower triangular factor */ 1011 matrixType = cusparseGetMatType(loTriFactor->descr); 1012 indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 1013 fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1014 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1015 diagType = cusparseGetMatDiagType(loTriFactor->descr); 1016 1017 /* Create the matrix description */ 1018 stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat); 1019 stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 1020 stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 1021 stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 1022 stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1023 1024 /* set the operation */ 1025 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1026 1027 /* allocate GPU space for the CSC of the lower triangular factor*/ 1028 loTriFactorT->csrMat = new CsrMatrix; 1029 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 1030 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 1031 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 1032 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1); 1033 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 1034 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 1035 1036 /* compute the transpose of the lower triangular factor, i.e. the CSC */ 1037 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1038 stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1039 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1040 loTriFactor->csrMat->values->data().get(), 1041 loTriFactor->csrMat->row_offsets->data().get(), 1042 loTriFactor->csrMat->column_indices->data().get(), 1043 loTriFactorT->csrMat->values->data().get(), 1044 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1045 CUSPARSE_ACTION_NUMERIC,indexBase, 1046 CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1047 cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1048 #endif 1049 1050 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1051 stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1052 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1053 loTriFactor->csrMat->values->data().get(), 1054 loTriFactor->csrMat->row_offsets->data().get(), 1055 loTriFactor->csrMat->column_indices->data().get(), 1056 loTriFactorT->csrMat->values->data().get(), 1057 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1058 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1059 CUSPARSE_ACTION_NUMERIC, indexBase, 1060 CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat); 1061 #else 1062 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1063 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1064 #endif 1065 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1066 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1067 1068 /* Create the solve analysis information */ 1069 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1070 stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1071 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1072 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, 1073 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1074 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1075 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, 1076 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1077 cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1078 #endif 1079 1080 /* perform the solve analysis */ 1081 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 1082 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1083 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1084 loTriFactorT->csrMat->column_indices->data().get(), 1085 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1086 loTriFactorT->solveInfo, 1087 loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1088 #else 1089 loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1090 #endif 1091 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1092 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1093 1094 /* assign the pointer */ 1095 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 1096 1097 /*********************************************/ 1098 /* Now the Transpose of the Upper Tri Factor */ 1099 /*********************************************/ 1100 1101 /* allocate space for the transpose of the upper triangular factor */ 1102 ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr); 1103 upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1104 1105 /* set the matrix descriptors of the upper triangular factor */ 1106 matrixType = cusparseGetMatType(upTriFactor->descr); 1107 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 1108 fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1109 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1110 diagType = cusparseGetMatDiagType(upTriFactor->descr); 1111 1112 /* Create the matrix description */ 1113 stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat); 1114 stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 1115 stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 1116 stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 1117 stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1118 1119 /* set the operation */ 1120 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1121 1122 /* allocate GPU space for the CSC of the upper triangular factor*/ 1123 upTriFactorT->csrMat = new CsrMatrix; 1124 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1125 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1126 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1127 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1); 1128 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1129 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1130 1131 /* compute the transpose of the upper triangular factor, i.e. the CSC */ 1132 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1133 stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows, 1134 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1135 upTriFactor->csrMat->values->data().get(), 1136 upTriFactor->csrMat->row_offsets->data().get(), 1137 upTriFactor->csrMat->column_indices->data().get(), 1138 upTriFactorT->csrMat->values->data().get(), 1139 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1140 CUSPARSE_ACTION_NUMERIC,indexBase, 1141 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1142 cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1143 #endif 1144 1145 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1146 stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 1147 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1148 upTriFactor->csrMat->values->data().get(), 1149 upTriFactor->csrMat->row_offsets->data().get(), 1150 upTriFactor->csrMat->column_indices->data().get(), 1151 upTriFactorT->csrMat->values->data().get(), 1152 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1153 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1154 CUSPARSE_ACTION_NUMERIC, indexBase, 1155 CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat); 1156 #else 1157 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1158 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1159 #endif 1160 1161 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1162 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1163 1164 /* Create the solve analysis information */ 1165 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1166 stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1167 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1168 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, 1169 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1170 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1171 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, 1172 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1173 cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1174 #endif 1175 1176 /* perform the solve analysis */ 1177 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 1178 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1179 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1180 upTriFactorT->csrMat->column_indices->data().get(), 1181 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1182 upTriFactorT->solveInfo, 1183 upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1184 #else 1185 upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1186 #endif 1187 1188 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1189 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1190 1191 /* assign the pointer */ 1192 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 1193 PetscFunctionReturn(0); 1194 } 1195 1196 struct PetscScalarToPetscInt 1197 { 1198 __host__ __device__ 1199 PetscInt operator()(PetscScalar s) 1200 { 1201 return (PetscInt)PetscRealPart(s); 1202 } 1203 }; 1204 1205 static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTransposeForMult(Mat A) 1206 { 1207 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1208 Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT; 1209 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1210 cusparseStatus_t stat; 1211 cusparseIndexBase_t indexBase; 1212 cudaError_t err; 1213 PetscErrorCode ierr; 1214 1215 PetscFunctionBegin; 1216 if (!A->form_explicit_transpose || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0); 1217 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1218 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1219 if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct"); 1220 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1221 if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct"); 1222 if (A->transupdated) PetscFunctionReturn(0); 1223 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1224 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1225 if (cusparsestruct->format != MAT_CUSPARSE_CSR) { 1226 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 1227 } 1228 if (!cusparsestruct->matTranspose) { /* create cusparse matrix */ 1229 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 1230 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat); 1231 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1232 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat); 1233 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1234 1235 /* set alpha and beta */ 1236 err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 1237 err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1238 err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1239 err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1240 err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1241 err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1242 1243 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1244 CsrMatrix *matrixT = new CsrMatrix; 1245 matstructT->mat = matrixT; 1246 matrixT->num_rows = A->cmap->n; 1247 matrixT->num_cols = A->rmap->n; 1248 matrixT->num_entries = a->nz; 1249 matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 1250 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1251 matrixT->values = new THRUSTARRAY(a->nz); 1252 1253 if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); } 1254 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 1255 1256 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1257 stat = cusparseCreateCsr(&matstructT->matDescr, 1258 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1259 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1260 matrixT->values->data().get(), 1261 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1262 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 1263 #endif 1264 } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 1265 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1266 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1267 #else 1268 CsrMatrix *temp = new CsrMatrix; 1269 CsrMatrix *tempT = new CsrMatrix; 1270 /* First convert HYB to CSR */ 1271 temp->num_rows = A->rmap->n; 1272 temp->num_cols = A->cmap->n; 1273 temp->num_entries = a->nz; 1274 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1275 temp->column_indices = new THRUSTINTARRAY32(a->nz); 1276 temp->values = new THRUSTARRAY(a->nz); 1277 1278 stat = cusparse_hyb2csr(cusparsestruct->handle, 1279 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 1280 temp->values->data().get(), 1281 temp->row_offsets->data().get(), 1282 temp->column_indices->data().get());CHKERRCUSPARSE(stat); 1283 1284 /* Next, convert CSR to CSC (i.e. the matrix transpose) */ 1285 tempT->num_rows = A->rmap->n; 1286 tempT->num_cols = A->cmap->n; 1287 tempT->num_entries = a->nz; 1288 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1289 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1290 tempT->values = new THRUSTARRAY(a->nz); 1291 1292 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1293 temp->num_cols, temp->num_entries, 1294 temp->values->data().get(), 1295 temp->row_offsets->data().get(), 1296 temp->column_indices->data().get(), 1297 tempT->values->data().get(), 1298 tempT->column_indices->data().get(), 1299 tempT->row_offsets->data().get(), 1300 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1301 1302 /* Last, convert CSC to HYB */ 1303 cusparseHybMat_t hybMat; 1304 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1305 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1306 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1307 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1308 matstructT->descr, tempT->values->data().get(), 1309 tempT->row_offsets->data().get(), 1310 tempT->column_indices->data().get(), 1311 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1312 1313 /* assign the pointer */ 1314 matstructT->mat = hybMat; 1315 A->transupdated = PETSC_TRUE; 1316 /* delete temporaries */ 1317 if (tempT) { 1318 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1319 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1320 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1321 delete (CsrMatrix*) tempT; 1322 } 1323 if (temp) { 1324 if (temp->values) delete (THRUSTARRAY*) temp->values; 1325 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1326 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1327 delete (CsrMatrix*) temp; 1328 } 1329 #endif 1330 } 1331 } 1332 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */ 1333 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1334 CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat; 1335 if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix"); 1336 if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows"); 1337 if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols"); 1338 if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values"); 1339 if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT"); 1340 if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows"); 1341 if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols"); 1342 if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values"); 1343 if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */ 1344 cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1345 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 1346 ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 1347 } 1348 if (!cusparsestruct->csr2csc_i) { 1349 THRUSTARRAY csr2csc_a(matrix->num_entries); 1350 PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0)); 1351 1352 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1353 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1354 void *csr2cscBuffer; 1355 size_t csr2cscBufferSize; 1356 stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, 1357 A->cmap->n, matrix->num_entries, 1358 matrix->values->data().get(), 1359 cusparsestruct->rowoffsets_gpu->data().get(), 1360 matrix->column_indices->data().get(), 1361 matrixT->values->data().get(), 1362 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1363 CUSPARSE_ACTION_NUMERIC,indexBase, 1364 cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat); 1365 err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err); 1366 #endif 1367 1368 if (matrix->num_entries) { 1369 /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in 1370 mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK. 1371 I checked every parameters and they were just fine. I have no clue why cusparse complains. 1372 1373 Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[] 1374 should be filled with indexBase. So I just take a shortcut here. 1375 */ 1376 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1377 A->cmap->n,matrix->num_entries, 1378 csr2csc_a.data().get(), 1379 cusparsestruct->rowoffsets_gpu->data().get(), 1380 matrix->column_indices->data().get(), 1381 matrixT->values->data().get(), 1382 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1383 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1384 CUSPARSE_ACTION_NUMERIC,indexBase, 1385 cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat); 1386 #else 1387 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1388 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1389 #endif 1390 } else { 1391 matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 1392 } 1393 1394 cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); 1395 PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt())); 1396 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1397 err = cudaFree(csr2cscBuffer);CHKERRCUDA(err); 1398 #endif 1399 } 1400 PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), 1401 thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), 1402 matrixT->values->begin())); 1403 } 1404 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1405 ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1406 /* the compressed row indices is not used for matTranspose */ 1407 matstructT->cprowIndices = NULL; 1408 /* assign the pointer */ 1409 ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 1410 A->transupdated = PETSC_TRUE; 1411 PetscFunctionReturn(0); 1412 } 1413 1414 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 1415 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1416 { 1417 PetscInt n = xx->map->n; 1418 const PetscScalar *barray; 1419 PetscScalar *xarray; 1420 thrust::device_ptr<const PetscScalar> bGPU; 1421 thrust::device_ptr<PetscScalar> xGPU; 1422 cusparseStatus_t stat; 1423 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1424 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1425 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1426 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1427 PetscErrorCode ierr; 1428 1429 PetscFunctionBegin; 1430 /* Analyze the matrix and create the transpose ... on the fly */ 1431 if (!loTriFactorT && !upTriFactorT) { 1432 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1433 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1434 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1435 } 1436 1437 /* Get the GPU pointers */ 1438 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1439 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1440 xGPU = thrust::device_pointer_cast(xarray); 1441 bGPU = thrust::device_pointer_cast(barray); 1442 1443 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1444 /* First, reorder with the row permutation */ 1445 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1446 thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 1447 xGPU); 1448 1449 /* First, solve U */ 1450 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1451 upTriFactorT->csrMat->num_rows, 1452 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1453 upTriFactorT->csrMat->num_entries, 1454 #endif 1455 &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1456 upTriFactorT->csrMat->values->data().get(), 1457 upTriFactorT->csrMat->row_offsets->data().get(), 1458 upTriFactorT->csrMat->column_indices->data().get(), 1459 upTriFactorT->solveInfo, 1460 xarray, 1461 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1462 tempGPU->data().get(), 1463 upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1464 #else 1465 tempGPU->data().get());CHKERRCUSPARSE(stat); 1466 #endif 1467 1468 /* Then, solve L */ 1469 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1470 loTriFactorT->csrMat->num_rows, 1471 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1472 loTriFactorT->csrMat->num_entries, 1473 #endif 1474 &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1475 loTriFactorT->csrMat->values->data().get(), 1476 loTriFactorT->csrMat->row_offsets->data().get(), 1477 loTriFactorT->csrMat->column_indices->data().get(), 1478 loTriFactorT->solveInfo, 1479 tempGPU->data().get(), 1480 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1481 xarray, 1482 loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1483 #else 1484 xarray);CHKERRCUSPARSE(stat); 1485 #endif 1486 1487 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1488 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1489 thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1490 tempGPU->begin()); 1491 1492 /* Copy the temporary to the full solution. */ 1493 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU); 1494 1495 /* restore */ 1496 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1497 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1498 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1499 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1500 PetscFunctionReturn(0); 1501 } 1502 1503 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1504 { 1505 const PetscScalar *barray; 1506 PetscScalar *xarray; 1507 cusparseStatus_t stat; 1508 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1509 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1510 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1511 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1512 PetscErrorCode ierr; 1513 1514 PetscFunctionBegin; 1515 /* Analyze the matrix and create the transpose ... on the fly */ 1516 if (!loTriFactorT && !upTriFactorT) { 1517 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1518 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1519 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1520 } 1521 1522 /* Get the GPU pointers */ 1523 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1524 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1525 1526 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1527 /* First, solve U */ 1528 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1529 upTriFactorT->csrMat->num_rows, 1530 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1531 upTriFactorT->csrMat->num_entries, 1532 #endif 1533 &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1534 upTriFactorT->csrMat->values->data().get(), 1535 upTriFactorT->csrMat->row_offsets->data().get(), 1536 upTriFactorT->csrMat->column_indices->data().get(), 1537 upTriFactorT->solveInfo, 1538 barray, 1539 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1540 tempGPU->data().get(), 1541 upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1542 #else 1543 tempGPU->data().get());CHKERRCUSPARSE(stat); 1544 #endif 1545 1546 /* Then, solve L */ 1547 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1548 loTriFactorT->csrMat->num_rows, 1549 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1550 loTriFactorT->csrMat->num_entries, 1551 #endif 1552 &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1553 loTriFactorT->csrMat->values->data().get(), 1554 loTriFactorT->csrMat->row_offsets->data().get(), 1555 loTriFactorT->csrMat->column_indices->data().get(), 1556 loTriFactorT->solveInfo, 1557 tempGPU->data().get(), 1558 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1559 xarray, 1560 loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1561 #else 1562 xarray);CHKERRCUSPARSE(stat); 1563 #endif 1564 1565 /* restore */ 1566 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1567 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1568 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1569 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1570 PetscFunctionReturn(0); 1571 } 1572 1573 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1574 { 1575 const PetscScalar *barray; 1576 PetscScalar *xarray; 1577 thrust::device_ptr<const PetscScalar> bGPU; 1578 thrust::device_ptr<PetscScalar> xGPU; 1579 cusparseStatus_t stat; 1580 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1581 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1582 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1583 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1584 PetscErrorCode ierr; 1585 1586 PetscFunctionBegin; 1587 1588 /* Get the GPU pointers */ 1589 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1590 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1591 xGPU = thrust::device_pointer_cast(xarray); 1592 bGPU = thrust::device_pointer_cast(barray); 1593 1594 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1595 /* First, reorder with the row permutation */ 1596 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1597 thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 1598 tempGPU->begin()); 1599 1600 /* Next, solve L */ 1601 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1602 loTriFactor->csrMat->num_rows, 1603 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1604 loTriFactor->csrMat->num_entries, 1605 #endif 1606 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1607 loTriFactor->csrMat->values->data().get(), 1608 loTriFactor->csrMat->row_offsets->data().get(), 1609 loTriFactor->csrMat->column_indices->data().get(), 1610 loTriFactor->solveInfo, 1611 tempGPU->data().get(), 1612 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1613 xarray, 1614 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1615 #else 1616 xarray);CHKERRCUSPARSE(stat); 1617 #endif 1618 1619 /* Then, solve U */ 1620 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1621 upTriFactor->csrMat->num_rows, 1622 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1623 upTriFactor->csrMat->num_entries, 1624 #endif 1625 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1626 upTriFactor->csrMat->values->data().get(), 1627 upTriFactor->csrMat->row_offsets->data().get(), 1628 upTriFactor->csrMat->column_indices->data().get(), 1629 upTriFactor->solveInfo,xarray, 1630 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1631 tempGPU->data().get(), 1632 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1633 #else 1634 tempGPU->data().get());CHKERRCUSPARSE(stat); 1635 #endif 1636 1637 /* Last, reorder with the column permutation */ 1638 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 1639 thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), 1640 xGPU); 1641 1642 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1643 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1644 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1645 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1646 PetscFunctionReturn(0); 1647 } 1648 1649 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1650 { 1651 const PetscScalar *barray; 1652 PetscScalar *xarray; 1653 cusparseStatus_t stat; 1654 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1655 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1656 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1657 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1658 PetscErrorCode ierr; 1659 1660 PetscFunctionBegin; 1661 /* Get the GPU pointers */ 1662 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1663 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1664 1665 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1666 /* First, solve L */ 1667 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1668 loTriFactor->csrMat->num_rows, 1669 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1670 loTriFactor->csrMat->num_entries, 1671 #endif 1672 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1673 loTriFactor->csrMat->values->data().get(), 1674 loTriFactor->csrMat->row_offsets->data().get(), 1675 loTriFactor->csrMat->column_indices->data().get(), 1676 loTriFactor->solveInfo, 1677 barray, 1678 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1679 tempGPU->data().get(), 1680 loTriFactor->solvePolicy,loTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1681 #else 1682 tempGPU->data().get());CHKERRCUSPARSE(stat); 1683 #endif 1684 1685 /* Next, solve U */ 1686 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1687 upTriFactor->csrMat->num_rows, 1688 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1689 upTriFactor->csrMat->num_entries, 1690 #endif 1691 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1692 upTriFactor->csrMat->values->data().get(), 1693 upTriFactor->csrMat->row_offsets->data().get(), 1694 upTriFactor->csrMat->column_indices->data().get(), 1695 upTriFactor->solveInfo, 1696 tempGPU->data().get(), 1697 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1698 xarray, 1699 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1700 #else 1701 xarray);CHKERRCUSPARSE(stat); 1702 #endif 1703 1704 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1705 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1706 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1707 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1708 PetscFunctionReturn(0); 1709 } 1710 1711 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) 1712 { 1713 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1714 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1715 cudaError_t cerr; 1716 PetscErrorCode ierr; 1717 1718 PetscFunctionBegin; 1719 if (A->offloadmask == PETSC_OFFLOAD_GPU) { 1720 CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat; 1721 1722 ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 1723 cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 1724 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1725 ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr); 1726 ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 1727 A->offloadmask = PETSC_OFFLOAD_BOTH; 1728 } 1729 PetscFunctionReturn(0); 1730 } 1731 1732 static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 1733 { 1734 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1735 PetscErrorCode ierr; 1736 1737 PetscFunctionBegin; 1738 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 1739 *array = a->a; 1740 A->offloadmask = PETSC_OFFLOAD_CPU; 1741 PetscFunctionReturn(0); 1742 } 1743 1744 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 1745 { 1746 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1747 Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 1748 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1749 PetscInt m = A->rmap->n,*ii,*ridx,tmp; 1750 PetscErrorCode ierr; 1751 cusparseStatus_t stat; 1752 PetscBool both = PETSC_TRUE; 1753 cudaError_t err; 1754 1755 PetscFunctionBegin; 1756 if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU"); 1757 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 1758 if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */ 1759 CsrMatrix *matrix; 1760 matrix = (CsrMatrix*)cusparsestruct->mat->mat; 1761 1762 if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values"); 1763 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1764 matrix->values->assign(a->a, a->a+a->nz); 1765 err = WaitForCUDA();CHKERRCUDA(err); 1766 ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 1767 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1768 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); 1769 } else { 1770 PetscInt nnz; 1771 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1772 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr); 1773 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 1774 delete cusparsestruct->workVector; 1775 delete cusparsestruct->rowoffsets_gpu; 1776 cusparsestruct->workVector = NULL; 1777 cusparsestruct->rowoffsets_gpu = NULL; 1778 try { 1779 if (a->compressedrow.use) { 1780 m = a->compressedrow.nrows; 1781 ii = a->compressedrow.i; 1782 ridx = a->compressedrow.rindex; 1783 } else { 1784 m = A->rmap->n; 1785 ii = a->i; 1786 ridx = NULL; 1787 } 1788 if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data"); 1789 if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data"); 1790 if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; } 1791 else nnz = a->nz; 1792 1793 /* create cusparse matrix */ 1794 cusparsestruct->nrows = m; 1795 matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 1796 stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat); 1797 stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 1798 stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1799 1800 err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 1801 err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1802 err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1803 err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1804 err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1805 err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1806 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 1807 1808 /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1809 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1810 /* set the matrix */ 1811 CsrMatrix *mat= new CsrMatrix; 1812 mat->num_rows = m; 1813 mat->num_cols = A->cmap->n; 1814 mat->num_entries = nnz; 1815 mat->row_offsets = new THRUSTINTARRAY32(m+1); 1816 mat->row_offsets->assign(ii, ii + m+1); 1817 1818 mat->column_indices = new THRUSTINTARRAY32(nnz); 1819 mat->column_indices->assign(a->j, a->j+nnz); 1820 1821 mat->values = new THRUSTARRAY(nnz); 1822 if (a->a) mat->values->assign(a->a, a->a+nnz); 1823 1824 /* assign the pointer */ 1825 matstruct->mat = mat; 1826 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1827 if (mat->num_rows) { /* cusparse errors on empty matrices! */ 1828 stat = cusparseCreateCsr(&matstruct->matDescr, 1829 mat->num_rows, mat->num_cols, mat->num_entries, 1830 mat->row_offsets->data().get(), mat->column_indices->data().get(), 1831 mat->values->data().get(), 1832 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 1833 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 1834 } 1835 #endif 1836 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1837 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1838 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1839 #else 1840 CsrMatrix *mat= new CsrMatrix; 1841 mat->num_rows = m; 1842 mat->num_cols = A->cmap->n; 1843 mat->num_entries = nnz; 1844 mat->row_offsets = new THRUSTINTARRAY32(m+1); 1845 mat->row_offsets->assign(ii, ii + m+1); 1846 1847 mat->column_indices = new THRUSTINTARRAY32(nnz); 1848 mat->column_indices->assign(a->j, a->j+nnz); 1849 1850 mat->values = new THRUSTARRAY(nnz); 1851 if (a->a) mat->values->assign(a->a, a->a+nnz); 1852 1853 cusparseHybMat_t hybMat; 1854 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1855 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1856 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1857 stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, 1858 matstruct->descr, mat->values->data().get(), 1859 mat->row_offsets->data().get(), 1860 mat->column_indices->data().get(), 1861 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1862 /* assign the pointer */ 1863 matstruct->mat = hybMat; 1864 1865 if (mat) { 1866 if (mat->values) delete (THRUSTARRAY*)mat->values; 1867 if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices; 1868 if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets; 1869 delete (CsrMatrix*)mat; 1870 } 1871 #endif 1872 } 1873 1874 /* assign the compressed row indices */ 1875 if (a->compressedrow.use) { 1876 cusparsestruct->workVector = new THRUSTARRAY(m); 1877 matstruct->cprowIndices = new THRUSTINTARRAY(m); 1878 matstruct->cprowIndices->assign(ridx,ridx+m); 1879 tmp = m; 1880 } else { 1881 cusparsestruct->workVector = NULL; 1882 matstruct->cprowIndices = NULL; 1883 tmp = 0; 1884 } 1885 ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr); 1886 1887 /* assign the pointer */ 1888 cusparsestruct->mat = matstruct; 1889 } catch(char *ex) { 1890 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1891 } 1892 err = WaitForCUDA();CHKERRCUDA(err); 1893 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1894 cusparsestruct->nonzerostate = A->nonzerostate; 1895 } 1896 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 1897 } 1898 PetscFunctionReturn(0); 1899 } 1900 1901 struct VecCUDAPlusEquals 1902 { 1903 template <typename Tuple> 1904 __host__ __device__ 1905 void operator()(Tuple t) 1906 { 1907 thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 1908 } 1909 }; 1910 1911 struct VecCUDAEquals 1912 { 1913 template <typename Tuple> 1914 __host__ __device__ 1915 void operator()(Tuple t) 1916 { 1917 thrust::get<1>(t) = thrust::get<0>(t); 1918 } 1919 }; 1920 1921 struct VecCUDAEqualsReverse 1922 { 1923 template <typename Tuple> 1924 __host__ __device__ 1925 void operator()(Tuple t) 1926 { 1927 thrust::get<0>(t) = thrust::get<1>(t); 1928 } 1929 }; 1930 1931 struct MatMatCusparse { 1932 PetscBool cisdense; 1933 PetscScalar *Bt; 1934 Mat X; 1935 PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */ 1936 PetscLogDouble flops; 1937 CsrMatrix *Bcsr; 1938 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1939 cusparseSpMatDescr_t matSpBDescr; 1940 PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 1941 cusparseDnMatDescr_t matBDescr; 1942 cusparseDnMatDescr_t matCDescr; 1943 PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/ 1944 size_t mmBufferSize; 1945 void *mmBuffer; 1946 void *mmBuffer2; /* SpGEMM WorkEstimation buffer */ 1947 cusparseSpGEMMDescr_t spgemmDesc; 1948 #endif 1949 }; 1950 1951 static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 1952 { 1953 PetscErrorCode ierr; 1954 MatMatCusparse *mmdata = (MatMatCusparse *)data; 1955 cudaError_t cerr; 1956 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1957 cusparseStatus_t stat; 1958 #endif 1959 1960 PetscFunctionBegin; 1961 cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr); 1962 delete mmdata->Bcsr; 1963 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1964 if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); } 1965 if (mmdata->mmBuffer) { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); } 1966 if (mmdata->mmBuffer2) { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); } 1967 if (mmdata->matBDescr) { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); } 1968 if (mmdata->matCDescr) { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); } 1969 if (mmdata->spgemmDesc) { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); } 1970 #endif 1971 ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr); 1972 ierr = PetscFree(data);CHKERRQ(ierr); 1973 PetscFunctionReturn(0); 1974 } 1975 1976 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool); 1977 1978 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 1979 { 1980 Mat_Product *product = C->product; 1981 Mat A,B; 1982 PetscInt m,n,blda,clda; 1983 PetscBool flg,biscuda; 1984 Mat_SeqAIJCUSPARSE *cusp; 1985 cusparseStatus_t stat; 1986 cusparseOperation_t opA; 1987 const PetscScalar *barray; 1988 PetscScalar *carray; 1989 PetscErrorCode ierr; 1990 MatMatCusparse *mmdata; 1991 Mat_SeqAIJCUSPARSEMultStruct *mat; 1992 CsrMatrix *csrmat; 1993 1994 PetscFunctionBegin; 1995 MatCheckProduct(C,1); 1996 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty"); 1997 mmdata = (MatMatCusparse*)product->data; 1998 A = product->A; 1999 B = product->B; 2000 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2001 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2002 /* currently CopyToGpu does not copy if the matrix is bound to CPU 2003 Instead of silently accepting the wrong answer, I prefer to raise the error */ 2004 if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2005 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2006 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2007 switch (product->type) { 2008 case MATPRODUCT_AB: 2009 case MATPRODUCT_PtAP: 2010 mat = cusp->mat; 2011 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2012 m = A->rmap->n; 2013 n = B->cmap->n; 2014 break; 2015 case MATPRODUCT_AtB: 2016 if (!A->form_explicit_transpose) { 2017 mat = cusp->mat; 2018 opA = CUSPARSE_OPERATION_TRANSPOSE; 2019 } else { 2020 ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr); 2021 mat = cusp->matTranspose; 2022 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2023 } 2024 m = A->cmap->n; 2025 n = B->cmap->n; 2026 break; 2027 case MATPRODUCT_ABt: 2028 case MATPRODUCT_RARt: 2029 mat = cusp->mat; 2030 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2031 m = A->rmap->n; 2032 n = B->rmap->n; 2033 break; 2034 default: 2035 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2036 } 2037 if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 2038 csrmat = (CsrMatrix*)mat->mat; 2039 /* if the user passed a CPU matrix, copy the data to the GPU */ 2040 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr); 2041 if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);} 2042 ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr); 2043 2044 ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr); 2045 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2046 ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2047 ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr); 2048 } else { 2049 ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr); 2050 ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr); 2051 } 2052 2053 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2054 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2055 cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 2056 /* (re)allcoate mmBuffer if not initialized or LDAs are different */ 2057 if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 2058 size_t mmBufferSize; 2059 if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;} 2060 if (!mmdata->matBDescr) { 2061 stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2062 mmdata->Blda = blda; 2063 } 2064 2065 if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;} 2066 if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 2067 stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2068 mmdata->Clda = clda; 2069 } 2070 2071 if (!mat->matDescr) { 2072 stat = cusparseCreateCsr(&mat->matDescr, 2073 csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, 2074 csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), 2075 csrmat->values->data().get(), 2076 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2077 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 2078 } 2079 stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one, 2080 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2081 mmdata->matCDescr,cusparse_scalartype, 2082 cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat); 2083 if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) { 2084 cudaError_t cerr; 2085 cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); 2086 cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr); 2087 mmdata->mmBufferSize = mmBufferSize; 2088 } 2089 mmdata->initialized = PETSC_TRUE; 2090 } else { 2091 /* to be safe, always update pointers of the mats */ 2092 stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat); 2093 stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat); 2094 stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat); 2095 } 2096 2097 /* do cusparseSpMM, which supports transpose on B */ 2098 stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one, 2099 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2100 mmdata->matCDescr,cusparse_scalartype, 2101 cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2102 #else 2103 PetscInt k; 2104 /* cusparseXcsrmm does not support transpose on B */ 2105 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2106 cublasHandle_t cublasv2handle; 2107 cublasStatus_t cerr; 2108 2109 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 2110 cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T, 2111 B->cmap->n,B->rmap->n, 2112 &PETSC_CUSPARSE_ONE ,barray,blda, 2113 &PETSC_CUSPARSE_ZERO,barray,blda, 2114 mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr); 2115 blda = B->cmap->n; 2116 k = B->cmap->n; 2117 } else { 2118 k = B->rmap->n; 2119 } 2120 2121 /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 2122 stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k, 2123 csrmat->num_entries,mat->alpha_one,mat->descr, 2124 csrmat->values->data().get(), 2125 csrmat->row_offsets->data().get(), 2126 csrmat->column_indices->data().get(), 2127 mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero, 2128 carray,clda);CHKERRCUSPARSE(stat); 2129 #endif 2130 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2131 ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr); 2132 ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr); 2133 if (product->type == MATPRODUCT_RARt) { 2134 ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2135 ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2136 } else if (product->type == MATPRODUCT_PtAP) { 2137 ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2138 ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2139 } else { 2140 ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr); 2141 } 2142 if (mmdata->cisdense) { 2143 ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr); 2144 } 2145 if (!biscuda) { 2146 ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 2147 } 2148 PetscFunctionReturn(0); 2149 } 2150 2151 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2152 { 2153 Mat_Product *product = C->product; 2154 Mat A,B; 2155 PetscInt m,n; 2156 PetscBool cisdense,flg; 2157 PetscErrorCode ierr; 2158 MatMatCusparse *mmdata; 2159 Mat_SeqAIJCUSPARSE *cusp; 2160 2161 PetscFunctionBegin; 2162 MatCheckProduct(C,1); 2163 if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty"); 2164 A = product->A; 2165 B = product->B; 2166 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2167 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2168 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2169 if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2170 switch (product->type) { 2171 case MATPRODUCT_AB: 2172 m = A->rmap->n; 2173 n = B->cmap->n; 2174 break; 2175 case MATPRODUCT_AtB: 2176 m = A->cmap->n; 2177 n = B->cmap->n; 2178 break; 2179 case MATPRODUCT_ABt: 2180 m = A->rmap->n; 2181 n = B->rmap->n; 2182 break; 2183 case MATPRODUCT_PtAP: 2184 m = B->cmap->n; 2185 n = B->cmap->n; 2186 break; 2187 case MATPRODUCT_RARt: 2188 m = B->rmap->n; 2189 n = B->rmap->n; 2190 break; 2191 default: 2192 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2193 } 2194 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2195 /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */ 2196 ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr); 2197 ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr); 2198 2199 /* product data */ 2200 ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2201 mmdata->cisdense = cisdense; 2202 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0) 2203 /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */ 2204 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2205 cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr); 2206 } 2207 #endif 2208 /* for these products we need intermediate storage */ 2209 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2210 ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr); 2211 ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr); 2212 if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */ 2213 ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr); 2214 } else { 2215 ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr); 2216 } 2217 } 2218 C->product->data = mmdata; 2219 C->product->destroy = MatDestroy_MatMatCusparse; 2220 2221 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 2222 PetscFunctionReturn(0); 2223 } 2224 2225 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2226 { 2227 Mat_Product *product = C->product; 2228 Mat A,B; 2229 Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2230 Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 2231 Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2232 CsrMatrix *Acsr,*Bcsr,*Ccsr; 2233 PetscBool flg; 2234 PetscErrorCode ierr; 2235 cusparseStatus_t stat; 2236 cudaError_t cerr; 2237 MatProductType ptype; 2238 MatMatCusparse *mmdata; 2239 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2240 cusparseSpMatDescr_t BmatSpDescr; 2241 #endif 2242 2243 PetscFunctionBegin; 2244 MatCheckProduct(C,1); 2245 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty"); 2246 ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2247 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name); 2248 mmdata = (MatMatCusparse*)C->product->data; 2249 A = product->A; 2250 B = product->B; 2251 if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */ 2252 mmdata->reusesym = PETSC_FALSE; 2253 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2254 if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2255 Cmat = Ccusp->mat; 2256 if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]); 2257 Ccsr = (CsrMatrix*)Cmat->mat; 2258 if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct"); 2259 goto finalize; 2260 } 2261 if (!c->nz) goto finalize; 2262 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2263 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2264 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2265 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name); 2266 if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2267 if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2268 Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2269 Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2270 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2271 if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2272 if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2273 if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2274 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2275 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 2276 2277 ptype = product->type; 2278 if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 2279 if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 2280 switch (ptype) { 2281 case MATPRODUCT_AB: 2282 Amat = Acusp->mat; 2283 Bmat = Bcusp->mat; 2284 break; 2285 case MATPRODUCT_AtB: 2286 Amat = Acusp->matTranspose; 2287 Bmat = Bcusp->mat; 2288 break; 2289 case MATPRODUCT_ABt: 2290 Amat = Acusp->mat; 2291 Bmat = Bcusp->matTranspose; 2292 break; 2293 default: 2294 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2295 } 2296 Cmat = Ccusp->mat; 2297 if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2298 if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2299 if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]); 2300 Acsr = (CsrMatrix*)Amat->mat; 2301 Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */ 2302 Ccsr = (CsrMatrix*)Cmat->mat; 2303 if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct"); 2304 if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct"); 2305 if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct"); 2306 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2307 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2308 BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */ 2309 stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2310 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2311 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2312 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2313 stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2314 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2315 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2316 #else 2317 stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2318 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2319 Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2320 Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2321 Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2322 #endif 2323 ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2324 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2325 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2326 C->offloadmask = PETSC_OFFLOAD_GPU; 2327 finalize: 2328 /* shorter version of MatAssemblyEnd_SeqAIJ */ 2329 ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 2330 ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 2331 ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr); 2332 c->reallocs = 0; 2333 C->info.mallocs += 0; 2334 C->info.nz_unneeded = 0; 2335 C->assembled = C->was_assembled = PETSC_TRUE; 2336 C->num_ass++; 2337 PetscFunctionReturn(0); 2338 } 2339 2340 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2341 { 2342 Mat_Product *product = C->product; 2343 Mat A,B; 2344 Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2345 Mat_SeqAIJ *a,*b,*c; 2346 Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2347 CsrMatrix *Acsr,*Bcsr,*Ccsr; 2348 PetscInt i,j,m,n,k; 2349 PetscBool flg; 2350 PetscErrorCode ierr; 2351 cusparseStatus_t stat; 2352 cudaError_t cerr; 2353 MatProductType ptype; 2354 MatMatCusparse *mmdata; 2355 PetscLogDouble flops; 2356 PetscBool biscompressed,ciscompressed; 2357 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2358 int64_t C_num_rows1, C_num_cols1, C_nnz1; 2359 size_t bufSize2; 2360 cusparseSpMatDescr_t BmatSpDescr; 2361 #else 2362 int cnz; 2363 #endif 2364 2365 PetscFunctionBegin; 2366 MatCheckProduct(C,1); 2367 if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty"); 2368 A = product->A; 2369 B = product->B; 2370 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2371 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2372 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2373 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name); 2374 a = (Mat_SeqAIJ*)A->data; 2375 b = (Mat_SeqAIJ*)B->data; 2376 Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2377 Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2378 if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2379 if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2380 2381 /* product data */ 2382 ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2383 C->product->data = mmdata; 2384 C->product->destroy = MatDestroy_MatMatCusparse; 2385 2386 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2387 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 2388 ptype = product->type; 2389 if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 2390 if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 2391 biscompressed = PETSC_FALSE; 2392 ciscompressed = PETSC_FALSE; 2393 switch (ptype) { 2394 case MATPRODUCT_AB: 2395 m = A->rmap->n; 2396 n = B->cmap->n; 2397 k = A->cmap->n; 2398 Amat = Acusp->mat; 2399 Bmat = Bcusp->mat; 2400 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2401 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2402 break; 2403 case MATPRODUCT_AtB: 2404 m = A->cmap->n; 2405 n = B->cmap->n; 2406 k = A->rmap->n; 2407 ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr); 2408 Amat = Acusp->matTranspose; 2409 Bmat = Bcusp->mat; 2410 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2411 break; 2412 case MATPRODUCT_ABt: 2413 m = A->rmap->n; 2414 n = B->rmap->n; 2415 k = A->cmap->n; 2416 ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr); 2417 Amat = Acusp->mat; 2418 Bmat = Bcusp->matTranspose; 2419 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2420 break; 2421 default: 2422 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2423 } 2424 2425 /* create cusparse matrix */ 2426 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2427 ierr = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 2428 c = (Mat_SeqAIJ*)C->data; 2429 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2430 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 2431 Ccsr = new CsrMatrix; 2432 2433 c->compressedrow.use = ciscompressed; 2434 if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */ 2435 c->compressedrow.nrows = a->compressedrow.nrows; 2436 ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr); 2437 ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr); 2438 Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 2439 Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 2440 Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows); 2441 } else { 2442 c->compressedrow.nrows = 0; 2443 c->compressedrow.i = NULL; 2444 c->compressedrow.rindex = NULL; 2445 Ccusp->workVector = NULL; 2446 Cmat->cprowIndices = NULL; 2447 } 2448 Ccusp->nrows = ciscompressed ? c->compressedrow.nrows : m; 2449 Ccusp->mat = Cmat; 2450 Ccusp->mat->mat = Ccsr; 2451 Ccsr->num_rows = Ccusp->nrows; 2452 Ccsr->num_cols = n; 2453 Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1); 2454 stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat); 2455 stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 2456 stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 2457 cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 2458 cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 2459 cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 2460 cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2461 cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2462 cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2463 if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */ 2464 thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0); 2465 c->nz = 0; 2466 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2467 Ccsr->values = new THRUSTARRAY(c->nz); 2468 goto finalizesym; 2469 } 2470 2471 if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2472 if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2473 Acsr = (CsrMatrix*)Amat->mat; 2474 if (!biscompressed) { 2475 Bcsr = (CsrMatrix*)Bmat->mat; 2476 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2477 BmatSpDescr = Bmat->matDescr; 2478 #endif 2479 } else { /* we need to use row offsets for the full matrix */ 2480 CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat; 2481 Bcsr = new CsrMatrix; 2482 Bcsr->num_rows = B->rmap->n; 2483 Bcsr->num_cols = cBcsr->num_cols; 2484 Bcsr->num_entries = cBcsr->num_entries; 2485 Bcsr->column_indices = cBcsr->column_indices; 2486 Bcsr->values = cBcsr->values; 2487 if (!Bcusp->rowoffsets_gpu) { 2488 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 2489 Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 2490 ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 2491 } 2492 Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 2493 mmdata->Bcsr = Bcsr; 2494 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2495 if (Bcsr->num_rows && Bcsr->num_cols) { 2496 stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, 2497 Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2498 Bcsr->values->data().get(), 2499 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2500 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2501 } 2502 BmatSpDescr = mmdata->matSpBDescr; 2503 #endif 2504 } 2505 if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct"); 2506 if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct"); 2507 /* precompute flops count */ 2508 if (ptype == MATPRODUCT_AB) { 2509 for (i=0, flops = 0; i<A->rmap->n; i++) { 2510 const PetscInt st = a->i[i]; 2511 const PetscInt en = a->i[i+1]; 2512 for (j=st; j<en; j++) { 2513 const PetscInt brow = a->j[j]; 2514 flops += 2.*(b->i[brow+1] - b->i[brow]); 2515 } 2516 } 2517 } else if (ptype == MATPRODUCT_AtB) { 2518 for (i=0, flops = 0; i<A->rmap->n; i++) { 2519 const PetscInt anzi = a->i[i+1] - a->i[i]; 2520 const PetscInt bnzi = b->i[i+1] - b->i[i]; 2521 flops += (2.*anzi)*bnzi; 2522 } 2523 } else { /* TODO */ 2524 flops = 0.; 2525 } 2526 2527 mmdata->flops = flops; 2528 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2529 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2530 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2531 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, 2532 NULL, NULL, NULL, 2533 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2534 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2535 stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2536 /* ask bufferSize bytes for external memory */ 2537 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2538 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2539 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2540 mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat); 2541 cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr); 2542 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2543 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2544 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2545 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2546 mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat); 2547 /* ask bufferSize again bytes for external memory */ 2548 stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2549 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2550 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2551 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat); 2552 /* The CUSPARSE documentation is not clear, nor the API 2553 We need both buffers to perform the operations properly! 2554 mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 2555 it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 2556 is stored in the descriptor! What a messy API... */ 2557 cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr); 2558 /* compute the intermediate product of A * B */ 2559 stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2560 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2561 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2562 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2563 /* get matrix C non-zero entries C_nnz1 */ 2564 stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat); 2565 c->nz = (PetscInt) C_nnz1; 2566 ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr); 2567 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2568 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2569 Ccsr->values = new THRUSTARRAY(c->nz); 2570 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2571 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2572 Ccsr->values->data().get());CHKERRCUSPARSE(stat); 2573 stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2574 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2575 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2576 #else 2577 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 2578 stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2579 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2580 Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2581 Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2582 Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat); 2583 c->nz = cnz; 2584 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2585 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2586 Ccsr->values = new THRUSTARRAY(c->nz); 2587 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2588 2589 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2590 /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 2591 I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 2592 D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */ 2593 stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2594 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2595 Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2596 Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2597 Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2598 #endif 2599 ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2600 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2601 finalizesym: 2602 c->singlemalloc = PETSC_FALSE; 2603 c->free_a = PETSC_TRUE; 2604 c->free_ij = PETSC_TRUE; 2605 ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 2606 ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 2607 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 2608 PetscInt *d_i = c->i; 2609 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 2610 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 2611 ii = *Ccsr->row_offsets; 2612 jj = *Ccsr->column_indices; 2613 if (ciscompressed) d_i = c->compressedrow.i; 2614 cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2615 cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2616 } else { 2617 PetscInt *d_i = c->i; 2618 if (ciscompressed) d_i = c->compressedrow.i; 2619 cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2620 cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2621 } 2622 if (ciscompressed) { /* need to expand host row offsets */ 2623 PetscInt r = 0; 2624 c->i[0] = 0; 2625 for (k = 0; k < c->compressedrow.nrows; k++) { 2626 const PetscInt next = c->compressedrow.rindex[k]; 2627 const PetscInt old = c->compressedrow.i[k]; 2628 for (; r < next; r++) c->i[r+1] = old; 2629 } 2630 for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows]; 2631 } 2632 ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 2633 ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 2634 ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 2635 c->maxnz = c->nz; 2636 c->nonzerorowcnt = 0; 2637 c->rmax = 0; 2638 for (k = 0; k < m; k++) { 2639 const PetscInt nn = c->i[k+1] - c->i[k]; 2640 c->ilen[k] = c->imax[k] = nn; 2641 c->nonzerorowcnt += (PetscInt)!!nn; 2642 c->rmax = PetscMax(c->rmax,nn); 2643 } 2644 ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr); 2645 ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 2646 Ccsr->num_entries = c->nz; 2647 2648 C->nonzerostate++; 2649 ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr); 2650 ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr); 2651 Ccusp->nonzerostate = C->nonzerostate; 2652 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 2653 C->preallocated = PETSC_TRUE; 2654 C->assembled = PETSC_FALSE; 2655 C->was_assembled = PETSC_FALSE; 2656 if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */ 2657 mmdata->reusesym = PETSC_TRUE; 2658 C->offloadmask = PETSC_OFFLOAD_GPU; 2659 } 2660 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2661 PetscFunctionReturn(0); 2662 } 2663 2664 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 2665 2666 /* handles sparse or dense B */ 2667 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat) 2668 { 2669 Mat_Product *product = mat->product; 2670 PetscErrorCode ierr; 2671 PetscBool isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE; 2672 2673 PetscFunctionBegin; 2674 MatCheckProduct(mat,1); 2675 ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr); 2676 if (!product->A->boundtocpu && !product->B->boundtocpu) { 2677 ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr); 2678 } 2679 if (product->type == MATPRODUCT_ABC) { 2680 Ciscusp = PETSC_FALSE; 2681 if (!product->C->boundtocpu) { 2682 ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr); 2683 } 2684 } 2685 if (isdense) { 2686 switch (product->type) { 2687 case MATPRODUCT_AB: 2688 case MATPRODUCT_AtB: 2689 case MATPRODUCT_ABt: 2690 case MATPRODUCT_PtAP: 2691 case MATPRODUCT_RARt: 2692 if (product->A->boundtocpu) { 2693 ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr); 2694 } else { 2695 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 2696 } 2697 break; 2698 case MATPRODUCT_ABC: 2699 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2700 break; 2701 default: 2702 break; 2703 } 2704 } else if (Biscusp && Ciscusp) { 2705 switch (product->type) { 2706 case MATPRODUCT_AB: 2707 case MATPRODUCT_AtB: 2708 case MATPRODUCT_ABt: 2709 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2710 break; 2711 case MATPRODUCT_PtAP: 2712 case MATPRODUCT_RARt: 2713 case MATPRODUCT_ABC: 2714 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2715 break; 2716 default: 2717 break; 2718 } 2719 } else { /* fallback for AIJ */ 2720 ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 2721 } 2722 PetscFunctionReturn(0); 2723 } 2724 2725 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2726 { 2727 PetscErrorCode ierr; 2728 2729 PetscFunctionBegin; 2730 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2731 PetscFunctionReturn(0); 2732 } 2733 2734 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz) 2735 { 2736 PetscErrorCode ierr; 2737 2738 PetscFunctionBegin; 2739 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2740 PetscFunctionReturn(0); 2741 } 2742 2743 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2744 { 2745 PetscErrorCode ierr; 2746 2747 PetscFunctionBegin; 2748 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 2749 PetscFunctionReturn(0); 2750 } 2751 2752 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2753 { 2754 PetscErrorCode ierr; 2755 2756 PetscFunctionBegin; 2757 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 2758 PetscFunctionReturn(0); 2759 } 2760 2761 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2762 { 2763 PetscErrorCode ierr; 2764 2765 PetscFunctionBegin; 2766 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2767 PetscFunctionReturn(0); 2768 } 2769 2770 __global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y) 2771 { 2772 int i = blockIdx.x*blockDim.x + threadIdx.x; 2773 if (i < n) y[idx[i]] += x[i]; 2774 } 2775 2776 /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 2777 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm) 2778 { 2779 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2780 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 2781 Mat_SeqAIJCUSPARSEMultStruct *matstruct; 2782 PetscScalar *xarray,*zarray,*dptr,*beta,*xptr; 2783 PetscErrorCode ierr; 2784 cusparseStatus_t stat; 2785 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2786 PetscBool compressed; 2787 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2788 PetscInt nx,ny; 2789 #endif 2790 2791 PetscFunctionBegin; 2792 if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported"); 2793 if (!a->nonzerorowcnt) { 2794 if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);} 2795 else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);} 2796 PetscFunctionReturn(0); 2797 } 2798 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 2799 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2800 if (!trans) { 2801 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 2802 if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 2803 } else { 2804 if (herm || !A->form_explicit_transpose) { 2805 opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 2806 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 2807 } else { 2808 if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);} 2809 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 2810 } 2811 } 2812 /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 2813 compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE; 2814 2815 try { 2816 ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 2817 if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */ 2818 else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */ 2819 2820 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2821 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 2822 /* z = A x + beta y. 2823 If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 2824 When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 2825 */ 2826 xptr = xarray; 2827 dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 2828 beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 2829 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2830 /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 2831 allocated to accommodate different uses. So we get the length info directly from mat. 2832 */ 2833 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2834 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2835 nx = mat->num_cols; 2836 ny = mat->num_rows; 2837 } 2838 #endif 2839 } else { 2840 /* z = A^T x + beta y 2841 If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 2842 Note A^Tx is of full length, so we set beta to 1.0 if y exists. 2843 */ 2844 xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 2845 dptr = zarray; 2846 beta = yy ? matstruct->beta_one : matstruct->beta_zero; 2847 if (compressed) { /* Scatter x to work vector */ 2848 thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 2849 thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 2850 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 2851 VecCUDAEqualsReverse()); 2852 } 2853 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2854 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2855 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2856 nx = mat->num_rows; 2857 ny = mat->num_cols; 2858 } 2859 #endif 2860 } 2861 2862 /* csr_spmv does y = alpha op(A) x + beta y */ 2863 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2864 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2865 if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 2866 if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 2867 cudaError_t cerr; 2868 stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 2869 stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 2870 stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, 2871 matstruct->matDescr, 2872 matstruct->cuSpMV[opA].vecXDescr, beta, 2873 matstruct->cuSpMV[opA].vecYDescr, 2874 cusparse_scalartype, 2875 cusparsestruct->spmvAlg, 2876 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat); 2877 cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr); 2878 2879 matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 2880 } else { 2881 /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 2882 stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat); 2883 stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat); 2884 } 2885 2886 stat = cusparseSpMV(cusparsestruct->handle, opA, 2887 matstruct->alpha_one, 2888 matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTransposeForMult() */ 2889 matstruct->cuSpMV[opA].vecXDescr, 2890 beta, 2891 matstruct->cuSpMV[opA].vecYDescr, 2892 cusparse_scalartype, 2893 cusparsestruct->spmvAlg, 2894 matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat); 2895 #else 2896 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2897 stat = cusparse_csr_spmv(cusparsestruct->handle, opA, 2898 mat->num_rows, mat->num_cols, 2899 mat->num_entries, matstruct->alpha_one, matstruct->descr, 2900 mat->values->data().get(), mat->row_offsets->data().get(), 2901 mat->column_indices->data().get(), xptr, beta, 2902 dptr);CHKERRCUSPARSE(stat); 2903 #endif 2904 } else { 2905 if (cusparsestruct->nrows) { 2906 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2907 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 2908 #else 2909 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 2910 stat = cusparse_hyb_spmv(cusparsestruct->handle, opA, 2911 matstruct->alpha_one, matstruct->descr, hybMat, 2912 xptr, beta, 2913 dptr);CHKERRCUSPARSE(stat); 2914 #endif 2915 } 2916 } 2917 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2918 2919 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 2920 if (yy) { /* MatMultAdd: zz = A*xx + yy */ 2921 if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 2922 ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */ 2923 } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */ 2924 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 2925 } 2926 } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 2927 ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr); 2928 } 2929 2930 /* ScatterAdd the result from work vector into the full vector when A is compressed */ 2931 if (compressed) { 2932 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2933 /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred) 2934 and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to 2935 prevent that. So I just add a ScatterAdd kernel. 2936 */ 2937 #if 0 2938 thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 2939 thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream), 2940 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 2941 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 2942 VecCUDAPlusEquals()); 2943 #else 2944 PetscInt n = matstruct->cprowIndices->size(); 2945 ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray); 2946 #endif 2947 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2948 } 2949 } else { 2950 if (yy && yy != zz) { 2951 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 2952 } 2953 } 2954 ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 2955 if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);} 2956 else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);} 2957 } catch(char *ex) { 2958 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 2959 } 2960 if (yy) { 2961 ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); 2962 } else { 2963 ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr); 2964 } 2965 PetscFunctionReturn(0); 2966 } 2967 2968 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2969 { 2970 PetscErrorCode ierr; 2971 2972 PetscFunctionBegin; 2973 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2974 PetscFunctionReturn(0); 2975 } 2976 2977 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 2978 { 2979 PetscErrorCode ierr; 2980 PetscSplitCSRDataStructure *d_mat = NULL; 2981 PetscFunctionBegin; 2982 if (A->factortype == MAT_FACTOR_NONE) { 2983 d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat; 2984 } 2985 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it? 2986 if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0); 2987 if (d_mat) { 2988 A->offloadmask = PETSC_OFFLOAD_GPU; 2989 } 2990 2991 PetscFunctionReturn(0); 2992 } 2993 2994 /* --------------------------------------------------------------------------------*/ 2995 /*@ 2996 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 2997 (the default parallel PETSc format). This matrix will ultimately pushed down 2998 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 2999 assembly performance the user should preallocate the matrix storage by setting 3000 the parameter nz (or the array nnz). By setting these parameters accurately, 3001 performance during matrix assembly can be increased by more than a factor of 50. 3002 3003 Collective 3004 3005 Input Parameters: 3006 + comm - MPI communicator, set to PETSC_COMM_SELF 3007 . m - number of rows 3008 . n - number of columns 3009 . nz - number of nonzeros per row (same for all rows) 3010 - nnz - array containing the number of nonzeros in the various rows 3011 (possibly different for each row) or NULL 3012 3013 Output Parameter: 3014 . A - the matrix 3015 3016 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 3017 MatXXXXSetPreallocation() paradgm instead of this routine directly. 3018 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 3019 3020 Notes: 3021 If nnz is given then nz is ignored 3022 3023 The AIJ format (also called the Yale sparse matrix format or 3024 compressed row storage), is fully compatible with standard Fortran 77 3025 storage. That is, the stored row and column indices can begin at 3026 either one (as in Fortran) or zero. See the users' manual for details. 3027 3028 Specify the preallocated storage with either nz or nnz (not both). 3029 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 3030 allocation. For large problems you MUST preallocate memory or you 3031 will get TERRIBLE performance, see the users' manual chapter on matrices. 3032 3033 By default, this format uses inodes (identical nodes) when possible, to 3034 improve numerical efficiency of matrix-vector products and solves. We 3035 search for consecutive rows with the same nonzero structure, thereby 3036 reusing matrix information to achieve increased efficiency. 3037 3038 Level: intermediate 3039 3040 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 3041 @*/ 3042 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 3043 { 3044 PetscErrorCode ierr; 3045 3046 PetscFunctionBegin; 3047 ierr = MatCreate(comm,A);CHKERRQ(ierr); 3048 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 3049 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3050 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 3051 PetscFunctionReturn(0); 3052 } 3053 3054 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 3055 { 3056 PetscErrorCode ierr; 3057 PetscSplitCSRDataStructure *d_mat = NULL; 3058 3059 PetscFunctionBegin; 3060 if (A->factortype == MAT_FACTOR_NONE) { 3061 d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat; 3062 ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL; 3063 ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr); 3064 } else { 3065 ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr); 3066 } 3067 if (d_mat) { 3068 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3069 cudaError_t err; 3070 PetscSplitCSRDataStructure h_mat; 3071 ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr); 3072 err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err); 3073 if (a->compressedrow.use) { 3074 err = cudaFree(h_mat.diag.i);CHKERRCUDA(err); 3075 } 3076 err = cudaFree(d_mat);CHKERRCUDA(err); 3077 } 3078 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr); 3079 ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr); 3080 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 3081 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 3082 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr); 3083 ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 3084 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 3085 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 3086 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 3087 PetscFunctionReturn(0); 3088 } 3089 3090 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*); 3091 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool); 3092 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B) 3093 { 3094 PetscErrorCode ierr; 3095 3096 PetscFunctionBegin; 3097 ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 3098 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr); 3099 PetscFunctionReturn(0); 3100 } 3101 3102 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str) 3103 { 3104 PetscErrorCode ierr; 3105 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 3106 Mat_SeqAIJCUSPARSE *cy; 3107 Mat_SeqAIJCUSPARSE *cx; 3108 PetscScalar *ay; 3109 const PetscScalar *ax; 3110 CsrMatrix *csry,*csrx; 3111 3112 PetscFunctionBegin; 3113 cy = (Mat_SeqAIJCUSPARSE*)Y->spptr; 3114 cx = (Mat_SeqAIJCUSPARSE*)X->spptr; 3115 if (X->ops->axpy != Y->ops->axpy) { 3116 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr); 3117 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 3118 PetscFunctionReturn(0); 3119 } 3120 /* if we are here, it means both matrices are bound to GPU */ 3121 ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr); 3122 ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr); 3123 if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported"); 3124 if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported"); 3125 csry = (CsrMatrix*)cy->mat->mat; 3126 csrx = (CsrMatrix*)cx->mat->mat; 3127 /* see if we can turn this into a cublas axpy */ 3128 if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) { 3129 bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin()); 3130 if (eq) { 3131 eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin()); 3132 } 3133 if (eq) str = SAME_NONZERO_PATTERN; 3134 } 3135 /* spgeam is buggy with one column */ 3136 if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN; 3137 3138 if (str == SUBSET_NONZERO_PATTERN) { 3139 cusparseStatus_t stat; 3140 PetscScalar b = 1.0; 3141 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3142 size_t bufferSize; 3143 void *buffer; 3144 cudaError_t cerr; 3145 #endif 3146 3147 ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr); 3148 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3149 stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 3150 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3151 stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n, 3152 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3153 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3154 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat); 3155 cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr); 3156 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3157 stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3158 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3159 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3160 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat); 3161 ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr); 3162 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3163 cerr = cudaFree(buffer);CHKERRCUDA(cerr); 3164 #else 3165 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3166 stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3167 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3168 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3169 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat); 3170 ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr); 3171 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3172 #endif 3173 stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 3174 ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr); 3175 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3176 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3177 } else if (str == SAME_NONZERO_PATTERN) { 3178 cublasHandle_t cublasv2handle; 3179 cublasStatus_t berr; 3180 PetscBLASInt one = 1, bnz = 1; 3181 3182 ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr); 3183 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3184 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 3185 ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr); 3186 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3187 berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr); 3188 ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr); 3189 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3190 ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr); 3191 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3192 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3193 } else { 3194 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr); 3195 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 3196 } 3197 PetscFunctionReturn(0); 3198 } 3199 3200 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a) 3201 { 3202 PetscErrorCode ierr; 3203 Mat_SeqAIJ *y = (Mat_SeqAIJ*)Y->data; 3204 PetscScalar *ay; 3205 cublasHandle_t cublasv2handle; 3206 cublasStatus_t berr; 3207 PetscBLASInt one = 1, bnz = 1; 3208 3209 PetscFunctionBegin; 3210 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3211 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 3212 ierr = PetscBLASIntCast(y->nz,&bnz);CHKERRQ(ierr); 3213 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3214 berr = cublasXscal(cublasv2handle,bnz,&a,ay,one);CHKERRCUBLAS(berr); 3215 ierr = PetscLogGpuFlops(bnz);CHKERRQ(ierr); 3216 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3217 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3218 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3219 PetscFunctionReturn(0); 3220 } 3221 3222 static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) 3223 { 3224 PetscErrorCode ierr; 3225 PetscBool both = PETSC_FALSE; 3226 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3227 3228 PetscFunctionBegin; 3229 if (A->factortype == MAT_FACTOR_NONE) { 3230 Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr; 3231 if (spptr->mat) { 3232 CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat; 3233 if (matrix->values) { 3234 both = PETSC_TRUE; 3235 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3236 } 3237 } 3238 if (spptr->matTranspose) { 3239 CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat; 3240 if (matrix->values) { 3241 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3242 } 3243 } 3244 } 3245 //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 3246 ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr); 3247 ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 3248 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 3249 else A->offloadmask = PETSC_OFFLOAD_CPU; 3250 3251 PetscFunctionReturn(0); 3252 } 3253 3254 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg) 3255 { 3256 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3257 PetscErrorCode ierr; 3258 3259 PetscFunctionBegin; 3260 if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0); 3261 if (flg) { 3262 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 3263 3264 A->ops->scale = MatScale_SeqAIJ; 3265 A->ops->axpy = MatAXPY_SeqAIJ; 3266 A->ops->zeroentries = MatZeroEntries_SeqAIJ; 3267 A->ops->mult = MatMult_SeqAIJ; 3268 A->ops->multadd = MatMultAdd_SeqAIJ; 3269 A->ops->multtranspose = MatMultTranspose_SeqAIJ; 3270 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 3271 A->ops->multhermitiantranspose = NULL; 3272 A->ops->multhermitiantransposeadd = NULL; 3273 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ; 3274 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr); 3275 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 3276 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 3277 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 3278 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 3279 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr); 3280 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr); 3281 } else { 3282 A->ops->scale = MatScale_SeqAIJCUSPARSE; 3283 A->ops->axpy = MatAXPY_SeqAIJCUSPARSE; 3284 A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE; 3285 A->ops->mult = MatMult_SeqAIJCUSPARSE; 3286 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 3287 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 3288 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 3289 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 3290 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 3291 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE; 3292 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr); 3293 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3294 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3295 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3296 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3297 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr); 3298 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3299 } 3300 A->boundtocpu = flg; 3301 a->inode.use = flg; 3302 PetscFunctionReturn(0); 3303 } 3304 3305 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 3306 { 3307 PetscErrorCode ierr; 3308 cusparseStatus_t stat; 3309 Mat B; 3310 3311 PetscFunctionBegin; 3312 ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */ 3313 if (reuse == MAT_INITIAL_MATRIX) { 3314 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 3315 } else if (reuse == MAT_REUSE_MATRIX) { 3316 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 3317 } 3318 B = *newmat; 3319 3320 ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 3321 ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr); 3322 3323 if (reuse != MAT_REUSE_MATRIX && !B->spptr) { 3324 if (B->factortype == MAT_FACTOR_NONE) { 3325 Mat_SeqAIJCUSPARSE *spptr; 3326 ierr = PetscNew(&spptr);CHKERRQ(ierr); 3327 stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3328 stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat); 3329 spptr->format = MAT_CUSPARSE_CSR; 3330 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3331 spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 3332 spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 3333 spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 3334 #endif 3335 B->spptr = spptr; 3336 } else { 3337 Mat_SeqAIJCUSPARSETriFactors *spptr; 3338 3339 ierr = PetscNew(&spptr);CHKERRQ(ierr); 3340 stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3341 stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat); 3342 B->spptr = spptr; 3343 } 3344 B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 3345 } 3346 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 3347 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 3348 B->ops->setoption = MatSetOption_SeqAIJCUSPARSE; 3349 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 3350 B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 3351 B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 3352 3353 ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr); 3354 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3355 ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 3356 PetscFunctionReturn(0); 3357 } 3358 3359 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 3360 { 3361 PetscErrorCode ierr; 3362 3363 PetscFunctionBegin; 3364 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 3365 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 3366 PetscFunctionReturn(0); 3367 } 3368 3369 /*MC 3370 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 3371 3372 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 3373 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 3374 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 3375 3376 Options Database Keys: 3377 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 3378 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3379 - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3380 3381 Level: beginner 3382 3383 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 3384 M*/ 3385 3386 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*); 3387 3388 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 3389 { 3390 PetscErrorCode ierr; 3391 3392 PetscFunctionBegin; 3393 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr); 3394 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3395 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3396 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3397 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3398 3399 PetscFunctionReturn(0); 3400 } 3401 3402 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 3403 { 3404 PetscErrorCode ierr; 3405 cusparseStatus_t stat; 3406 3407 PetscFunctionBegin; 3408 if (*cusparsestruct) { 3409 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr); 3410 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr); 3411 delete (*cusparsestruct)->workVector; 3412 delete (*cusparsestruct)->rowoffsets_gpu; 3413 delete (*cusparsestruct)->cooPerm; 3414 delete (*cusparsestruct)->cooPerm_a; 3415 delete (*cusparsestruct)->csr2csc_i; 3416 if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);} 3417 ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr); 3418 } 3419 PetscFunctionReturn(0); 3420 } 3421 3422 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 3423 { 3424 PetscFunctionBegin; 3425 if (*mat) { 3426 delete (*mat)->values; 3427 delete (*mat)->column_indices; 3428 delete (*mat)->row_offsets; 3429 delete *mat; 3430 *mat = 0; 3431 } 3432 PetscFunctionReturn(0); 3433 } 3434 3435 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 3436 { 3437 cusparseStatus_t stat; 3438 PetscErrorCode ierr; 3439 3440 PetscFunctionBegin; 3441 if (*trifactor) { 3442 if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); } 3443 if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); } 3444 ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 3445 if ((*trifactor)->solveBuffer) {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);} 3446 if ((*trifactor)->AA_h) {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);} 3447 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3448 if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);} 3449 #endif 3450 ierr = PetscFree(*trifactor);CHKERRQ(ierr); 3451 } 3452 PetscFunctionReturn(0); 3453 } 3454 3455 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 3456 { 3457 CsrMatrix *mat; 3458 cusparseStatus_t stat; 3459 cudaError_t err; 3460 3461 PetscFunctionBegin; 3462 if (*matstruct) { 3463 if ((*matstruct)->mat) { 3464 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 3465 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3466 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3467 #else 3468 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 3469 stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat); 3470 #endif 3471 } else { 3472 mat = (CsrMatrix*)(*matstruct)->mat; 3473 CsrMatrix_Destroy(&mat); 3474 } 3475 } 3476 if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); } 3477 delete (*matstruct)->cprowIndices; 3478 if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); } 3479 if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); } 3480 if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); } 3481 3482 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3483 Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 3484 if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);} 3485 for (int i=0; i<3; i++) { 3486 if (mdata->cuSpMV[i].initialized) { 3487 err = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err); 3488 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat); 3489 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat); 3490 } 3491 } 3492 #endif 3493 delete *matstruct; 3494 *matstruct = NULL; 3495 } 3496 PetscFunctionReturn(0); 3497 } 3498 3499 PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors) 3500 { 3501 PetscErrorCode ierr; 3502 3503 PetscFunctionBegin; 3504 if (*trifactors) { 3505 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr); 3506 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr); 3507 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr); 3508 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr); 3509 delete (*trifactors)->rpermIndices; 3510 delete (*trifactors)->cpermIndices; 3511 delete (*trifactors)->workVector; 3512 (*trifactors)->rpermIndices = NULL; 3513 (*trifactors)->cpermIndices = NULL; 3514 (*trifactors)->workVector = NULL; 3515 if ((*trifactors)->a_band_d) {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);} 3516 if ((*trifactors)->i_band_d) {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);} 3517 (*trifactors)->init_dev_prop = PETSC_FALSE; 3518 } 3519 PetscFunctionReturn(0); 3520 } 3521 3522 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 3523 { 3524 PetscErrorCode ierr; 3525 cusparseHandle_t handle; 3526 cusparseStatus_t stat; 3527 3528 PetscFunctionBegin; 3529 if (*trifactors) { 3530 ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr); 3531 if (handle = (*trifactors)->handle) { 3532 stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 3533 } 3534 ierr = PetscFree(*trifactors);CHKERRQ(ierr); 3535 } 3536 PetscFunctionReturn(0); 3537 } 3538 3539 struct IJCompare 3540 { 3541 __host__ __device__ 3542 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3543 { 3544 if (t1.get<0>() < t2.get<0>()) return true; 3545 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 3546 return false; 3547 } 3548 }; 3549 3550 struct IJEqual 3551 { 3552 __host__ __device__ 3553 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3554 { 3555 if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false; 3556 return true; 3557 } 3558 }; 3559 3560 struct IJDiff 3561 { 3562 __host__ __device__ 3563 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 3564 { 3565 return t1 == t2 ? 0 : 1; 3566 } 3567 }; 3568 3569 struct IJSum 3570 { 3571 __host__ __device__ 3572 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 3573 { 3574 return t1||t2; 3575 } 3576 }; 3577 3578 #include <thrust/iterator/discard_iterator.h> 3579 PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 3580 { 3581 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3582 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3583 THRUSTARRAY *cooPerm_v = NULL; 3584 thrust::device_ptr<const PetscScalar> d_v; 3585 CsrMatrix *matrix; 3586 PetscErrorCode ierr; 3587 PetscInt n; 3588 3589 PetscFunctionBegin; 3590 if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct"); 3591 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix"); 3592 if (!cusp->cooPerm) { 3593 ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3594 ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3595 PetscFunctionReturn(0); 3596 } 3597 matrix = (CsrMatrix*)cusp->mat->mat; 3598 if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3599 if (!v) { 3600 if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3601 goto finalize; 3602 } 3603 n = cusp->cooPerm->size(); 3604 if (isCudaMem(v)) { 3605 d_v = thrust::device_pointer_cast(v); 3606 } else { 3607 cooPerm_v = new THRUSTARRAY(n); 3608 cooPerm_v->assign(v,v+n); 3609 d_v = cooPerm_v->data(); 3610 ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); 3611 } 3612 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3613 if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */ 3614 if (cusp->cooPerm_a) { 3615 THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size()); 3616 auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3617 thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3618 thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>()); 3619 delete cooPerm_w; 3620 } else { 3621 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 3622 matrix->values->begin())); 3623 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 3624 matrix->values->end())); 3625 thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); 3626 } 3627 } else { 3628 if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */ 3629 auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3630 thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3631 } else { 3632 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 3633 matrix->values->begin())); 3634 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 3635 matrix->values->end())); 3636 thrust::for_each(zibit,zieit,VecCUDAEquals()); 3637 } 3638 } 3639 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3640 finalize: 3641 delete cooPerm_v; 3642 A->offloadmask = PETSC_OFFLOAD_GPU; 3643 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 3644 /* shorter version of MatAssemblyEnd_SeqAIJ */ 3645 ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr); 3646 ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 3647 ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr); 3648 a->reallocs = 0; 3649 A->info.mallocs += 0; 3650 A->info.nz_unneeded = 0; 3651 A->assembled = A->was_assembled = PETSC_TRUE; 3652 A->num_ass++; 3653 PetscFunctionReturn(0); 3654 } 3655 3656 PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy) 3657 { 3658 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3659 PetscErrorCode ierr; 3660 3661 PetscFunctionBegin; 3662 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3663 if (!cusp) PetscFunctionReturn(0); 3664 if (destroy) { 3665 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr); 3666 delete cusp->csr2csc_i; 3667 cusp->csr2csc_i = NULL; 3668 } 3669 A->transupdated = PETSC_FALSE; 3670 PetscFunctionReturn(0); 3671 } 3672 3673 #include <thrust/binary_search.h> 3674 PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[]) 3675 { 3676 PetscErrorCode ierr; 3677 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3678 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3679 PetscInt cooPerm_n, nzr = 0; 3680 cudaError_t cerr; 3681 3682 PetscFunctionBegin; 3683 ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr); 3684 ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr); 3685 cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0; 3686 if (n != cooPerm_n) { 3687 delete cusp->cooPerm; 3688 delete cusp->cooPerm_a; 3689 cusp->cooPerm = NULL; 3690 cusp->cooPerm_a = NULL; 3691 } 3692 if (n) { 3693 THRUSTINTARRAY d_i(n); 3694 THRUSTINTARRAY d_j(n); 3695 THRUSTINTARRAY ii(A->rmap->n); 3696 3697 if (!cusp->cooPerm) { cusp->cooPerm = new THRUSTINTARRAY(n); } 3698 if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); } 3699 3700 ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 3701 d_i.assign(coo_i,coo_i+n); 3702 d_j.assign(coo_j,coo_j+n); 3703 auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin())); 3704 auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end())); 3705 3706 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3707 thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0); 3708 thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); 3709 *cusp->cooPerm_a = d_i; 3710 THRUSTINTARRAY w = d_j; 3711 3712 auto nekey = thrust::unique(fkey, ekey, IJEqual()); 3713 if (nekey == ekey) { /* all entries are unique */ 3714 delete cusp->cooPerm_a; 3715 cusp->cooPerm_a = NULL; 3716 } else { /* I couldn't come up with a more elegant algorithm */ 3717 adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); 3718 adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff()); 3719 (*cusp->cooPerm_a)[0] = 0; 3720 w[0] = 0; 3721 thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); 3722 thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); 3723 } 3724 thrust::counting_iterator<PetscInt> search_begin(0); 3725 thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), 3726 search_begin, search_begin + A->rmap->n, 3727 ii.begin()); 3728 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3729 3730 ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr); 3731 a->singlemalloc = PETSC_FALSE; 3732 a->free_a = PETSC_TRUE; 3733 a->free_ij = PETSC_TRUE; 3734 ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr); 3735 a->i[0] = 0; 3736 cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 3737 a->nz = a->maxnz = a->i[A->rmap->n]; 3738 a->rmax = 0; 3739 ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr); 3740 ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr); 3741 cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 3742 if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); } 3743 if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); } 3744 for (PetscInt i = 0; i < A->rmap->n; i++) { 3745 const PetscInt nnzr = a->i[i+1] - a->i[i]; 3746 nzr += (PetscInt)!!(nnzr); 3747 a->ilen[i] = a->imax[i] = nnzr; 3748 a->rmax = PetscMax(a->rmax,nnzr); 3749 } 3750 a->nonzerorowcnt = nzr; 3751 A->preallocated = PETSC_TRUE; 3752 ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr); 3753 ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr); 3754 } else { 3755 ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr); 3756 } 3757 ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr); 3758 3759 /* We want to allocate the CUSPARSE struct for matvec now. 3760 The code is so convoluted now that I prefer to copy zeros */ 3761 ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr); 3762 ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr); 3763 A->offloadmask = PETSC_OFFLOAD_CPU; 3764 A->nonzerostate++; 3765 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 3766 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 3767 3768 A->assembled = PETSC_FALSE; 3769 A->was_assembled = PETSC_FALSE; 3770 PetscFunctionReturn(0); 3771 } 3772 3773 PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a) 3774 { 3775 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3776 CsrMatrix *csr; 3777 PetscErrorCode ierr; 3778 3779 PetscFunctionBegin; 3780 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3781 PetscValidPointer(a,2); 3782 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3783 if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3784 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 3785 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3786 csr = (CsrMatrix*)cusp->mat->mat; 3787 if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3788 *a = csr->values->data().get(); 3789 PetscFunctionReturn(0); 3790 } 3791 3792 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a) 3793 { 3794 PetscFunctionBegin; 3795 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3796 PetscValidPointer(a,2); 3797 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3798 *a = NULL; 3799 PetscFunctionReturn(0); 3800 } 3801 3802 PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a) 3803 { 3804 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3805 CsrMatrix *csr; 3806 PetscErrorCode ierr; 3807 3808 PetscFunctionBegin; 3809 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3810 PetscValidPointer(a,2); 3811 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3812 if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3813 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 3814 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3815 csr = (CsrMatrix*)cusp->mat->mat; 3816 if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3817 *a = csr->values->data().get(); 3818 A->offloadmask = PETSC_OFFLOAD_GPU; 3819 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); 3820 PetscFunctionReturn(0); 3821 } 3822 3823 PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a) 3824 { 3825 PetscErrorCode ierr; 3826 3827 PetscFunctionBegin; 3828 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3829 PetscValidPointer(a,2); 3830 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3831 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 3832 *a = NULL; 3833 PetscFunctionReturn(0); 3834 } 3835 3836 PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a) 3837 { 3838 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3839 CsrMatrix *csr; 3840 PetscErrorCode ierr; 3841 3842 PetscFunctionBegin; 3843 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3844 PetscValidPointer(a,2); 3845 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3846 if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3847 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3848 csr = (CsrMatrix*)cusp->mat->mat; 3849 if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3850 *a = csr->values->data().get(); 3851 A->offloadmask = PETSC_OFFLOAD_GPU; 3852 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); 3853 PetscFunctionReturn(0); 3854 } 3855 3856 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a) 3857 { 3858 PetscErrorCode ierr; 3859 3860 PetscFunctionBegin; 3861 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3862 PetscValidPointer(a,2); 3863 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3864 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 3865 *a = NULL; 3866 PetscFunctionReturn(0); 3867 } 3868 3869 struct IJCompare4 3870 { 3871 __host__ __device__ 3872 inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) 3873 { 3874 if (t1.get<0>() < t2.get<0>()) return true; 3875 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 3876 return false; 3877 } 3878 }; 3879 3880 struct Shift 3881 { 3882 int _shift; 3883 3884 Shift(int shift) : _shift(shift) {} 3885 __host__ __device__ 3886 inline int operator() (const int &c) 3887 { 3888 return c + _shift; 3889 } 3890 }; 3891 3892 /* merges to SeqAIJCUSPARSE matrices, [A';B']' operation in matlab notation */ 3893 PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 3894 { 3895 PetscErrorCode ierr; 3896 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c; 3897 Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp; 3898 Mat_SeqAIJCUSPARSEMultStruct *Cmat; 3899 CsrMatrix *Acsr,*Bcsr,*Ccsr; 3900 PetscInt Annz,Bnnz; 3901 cusparseStatus_t stat; 3902 PetscInt i,m,n,zero = 0; 3903 cudaError_t cerr; 3904 3905 PetscFunctionBegin; 3906 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3907 PetscValidHeaderSpecific(B,MAT_CLASSID,2); 3908 PetscValidPointer(C,4); 3909 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3910 PetscCheckTypeName(B,MATSEQAIJCUSPARSE); 3911 if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n); 3912 if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 3913 if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3914 if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3915 if (reuse == MAT_INITIAL_MATRIX) { 3916 m = A->rmap->n; 3917 n = A->cmap->n + B->cmap->n; 3918 ierr = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr); 3919 ierr = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr); 3920 ierr = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3921 c = (Mat_SeqAIJ*)(*C)->data; 3922 Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 3923 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 3924 Ccsr = new CsrMatrix; 3925 Cmat->cprowIndices = NULL; 3926 c->compressedrow.use = PETSC_FALSE; 3927 c->compressedrow.nrows = 0; 3928 c->compressedrow.i = NULL; 3929 c->compressedrow.rindex = NULL; 3930 Ccusp->workVector = NULL; 3931 Ccusp->nrows = m; 3932 Ccusp->mat = Cmat; 3933 Ccusp->mat->mat = Ccsr; 3934 Ccsr->num_rows = m; 3935 Ccsr->num_cols = n; 3936 stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat); 3937 stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 3938 stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 3939 cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 3940 cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 3941 cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 3942 cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 3943 cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 3944 cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 3945 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 3946 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 3947 ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr); 3948 ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr); 3949 if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3950 if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3951 3952 Acsr = (CsrMatrix*)Acusp->mat->mat; 3953 Bcsr = (CsrMatrix*)Bcusp->mat->mat; 3954 Annz = (PetscInt)Acsr->column_indices->size(); 3955 Bnnz = (PetscInt)Bcsr->column_indices->size(); 3956 c->nz = Annz + Bnnz; 3957 Ccsr->row_offsets = new THRUSTINTARRAY32(m+1); 3958 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3959 Ccsr->values = new THRUSTARRAY(c->nz); 3960 Ccsr->num_entries = c->nz; 3961 Ccusp->cooPerm = new THRUSTINTARRAY(c->nz); 3962 if (c->nz) { 3963 auto Acoo = new THRUSTINTARRAY32(Annz); 3964 auto Bcoo = new THRUSTINTARRAY32(Bnnz); 3965 auto Ccoo = new THRUSTINTARRAY32(c->nz); 3966 THRUSTINTARRAY32 *Aroff,*Broff; 3967 3968 if (a->compressedrow.use) { /* need full row offset */ 3969 if (!Acusp->rowoffsets_gpu) { 3970 Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 3971 Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 3972 ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 3973 } 3974 Aroff = Acusp->rowoffsets_gpu; 3975 } else Aroff = Acsr->row_offsets; 3976 if (b->compressedrow.use) { /* need full row offset */ 3977 if (!Bcusp->rowoffsets_gpu) { 3978 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 3979 Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 3980 ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 3981 } 3982 Broff = Bcusp->rowoffsets_gpu; 3983 } else Broff = Bcsr->row_offsets; 3984 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3985 stat = cusparseXcsr2coo(Acusp->handle, 3986 Aroff->data().get(), 3987 Annz, 3988 m, 3989 Acoo->data().get(), 3990 CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 3991 stat = cusparseXcsr2coo(Bcusp->handle, 3992 Broff->data().get(), 3993 Bnnz, 3994 m, 3995 Bcoo->data().get(), 3996 CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 3997 /* Issues when using bool with large matrices on SUMMIT 10.2.89 */ 3998 auto Aperm = thrust::make_constant_iterator(1); 3999 auto Bperm = thrust::make_constant_iterator(0); 4000 #if PETSC_PKG_CUDA_VERSION_GE(10,0,0) 4001 auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n)); 4002 auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n)); 4003 #else 4004 /* there are issues instantiating the merge operation using a transform iterator for the columns of B */ 4005 auto Bcib = Bcsr->column_indices->begin(); 4006 auto Bcie = Bcsr->column_indices->end(); 4007 thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n)); 4008 #endif 4009 auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz); 4010 auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm)); 4011 auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm)); 4012 auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm)); 4013 auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm)); 4014 auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin())); 4015 auto p1 = Ccusp->cooPerm->begin(); 4016 auto p2 = Ccusp->cooPerm->begin(); 4017 thrust::advance(p2,Annz); 4018 PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4())); 4019 #if PETSC_PKG_CUDA_VERSION_LT(10,0,0) 4020 thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n)); 4021 #endif 4022 auto cci = thrust::make_counting_iterator(zero); 4023 auto cce = thrust::make_counting_iterator(c->nz); 4024 #if 0 //Errors on SUMMIT cuda 11.1.0 4025 PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>())); 4026 #else 4027 auto pred = thrust::identity<int>(); 4028 PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred)); 4029 PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred)); 4030 #endif 4031 stat = cusparseXcoo2csr(Ccusp->handle, 4032 Ccoo->data().get(), 4033 c->nz, 4034 m, 4035 Ccsr->row_offsets->data().get(), 4036 CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4037 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4038 delete wPerm; 4039 delete Acoo; 4040 delete Bcoo; 4041 delete Ccoo; 4042 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4043 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, 4044 Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), 4045 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 4046 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 4047 #endif 4048 if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */ 4049 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4050 Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct; 4051 CsrMatrix *CcsrT = new CsrMatrix; 4052 CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4053 CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4054 4055 (*C)->form_explicit_transpose = PETSC_TRUE; 4056 (*C)->transupdated = PETSC_TRUE; 4057 Ccusp->rowoffsets_gpu = NULL; 4058 CmatT->cprowIndices = NULL; 4059 CmatT->mat = CcsrT; 4060 CcsrT->num_rows = n; 4061 CcsrT->num_cols = m; 4062 CcsrT->num_entries = c->nz; 4063 4064 CcsrT->row_offsets = new THRUSTINTARRAY32(n+1); 4065 CcsrT->column_indices = new THRUSTINTARRAY32(c->nz); 4066 CcsrT->values = new THRUSTARRAY(c->nz); 4067 4068 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4069 auto rT = CcsrT->row_offsets->begin(); 4070 if (AT) { 4071 rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT); 4072 thrust::advance(rT,-1); 4073 } 4074 if (BT) { 4075 auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz)); 4076 auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz)); 4077 thrust::copy(titb,tite,rT); 4078 } 4079 auto cT = CcsrT->column_indices->begin(); 4080 if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT); 4081 if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT); 4082 auto vT = CcsrT->values->begin(); 4083 if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4084 if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 4085 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4086 4087 stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat); 4088 stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4089 stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 4090 cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 4091 cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 4092 cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 4093 cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4094 cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4095 cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4096 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4097 stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, 4098 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), 4099 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 4100 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 4101 #endif 4102 Ccusp->matTranspose = CmatT; 4103 } 4104 } 4105 4106 c->singlemalloc = PETSC_FALSE; 4107 c->free_a = PETSC_TRUE; 4108 c->free_ij = PETSC_TRUE; 4109 ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 4110 ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 4111 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 4112 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 4113 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 4114 ii = *Ccsr->row_offsets; 4115 jj = *Ccsr->column_indices; 4116 cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4117 cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4118 } else { 4119 cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4120 cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4121 } 4122 ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 4123 ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 4124 ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 4125 c->maxnz = c->nz; 4126 c->nonzerorowcnt = 0; 4127 c->rmax = 0; 4128 for (i = 0; i < m; i++) { 4129 const PetscInt nn = c->i[i+1] - c->i[i]; 4130 c->ilen[i] = c->imax[i] = nn; 4131 c->nonzerorowcnt += (PetscInt)!!nn; 4132 c->rmax = PetscMax(c->rmax,nn); 4133 } 4134 ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr); 4135 ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 4136 (*C)->nonzerostate++; 4137 ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr); 4138 ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr); 4139 Ccusp->nonzerostate = (*C)->nonzerostate; 4140 (*C)->preallocated = PETSC_TRUE; 4141 } else { 4142 if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n); 4143 c = (Mat_SeqAIJ*)(*C)->data; 4144 if (c->nz) { 4145 Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 4146 if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm"); 4147 if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4148 if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate"); 4149 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4150 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 4151 if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4152 if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4153 Acsr = (CsrMatrix*)Acusp->mat->mat; 4154 Bcsr = (CsrMatrix*)Bcusp->mat->mat; 4155 Ccsr = (CsrMatrix*)Ccusp->mat->mat; 4156 if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size()); 4157 if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size()); 4158 if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size()); 4159 if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries); 4160 if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size()); 4161 auto pmid = Ccusp->cooPerm->begin(); 4162 thrust::advance(pmid,Acsr->num_entries); 4163 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4164 auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), 4165 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin()))); 4166 auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), 4167 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4168 thrust::for_each(zibait,zieait,VecCUDAEquals()); 4169 auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), 4170 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4171 auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), 4172 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end()))); 4173 thrust::for_each(zibbit,ziebit,VecCUDAEquals()); 4174 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr); 4175 if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) { 4176 if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct"); 4177 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4178 CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4179 CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4180 CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat; 4181 auto vT = CcsrT->values->begin(); 4182 if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4183 if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 4184 (*C)->transupdated = PETSC_TRUE; 4185 } 4186 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4187 } 4188 } 4189 ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr); 4190 (*C)->assembled = PETSC_TRUE; 4191 (*C)->was_assembled = PETSC_FALSE; 4192 (*C)->offloadmask = PETSC_OFFLOAD_GPU; 4193 PetscFunctionReturn(0); 4194 } 4195 4196 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[]) 4197 { 4198 PetscErrorCode ierr; 4199 bool dmem; 4200 const PetscScalar *av; 4201 cudaError_t cerr; 4202 4203 PetscFunctionBegin; 4204 dmem = isCudaMem(v); 4205 ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr); 4206 if (n && idx) { 4207 THRUSTINTARRAY widx(n); 4208 widx.assign(idx,idx+n); 4209 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 4210 4211 THRUSTARRAY *w = NULL; 4212 thrust::device_ptr<PetscScalar> dv; 4213 if (dmem) { 4214 dv = thrust::device_pointer_cast(v); 4215 } else { 4216 w = new THRUSTARRAY(n); 4217 dv = w->data(); 4218 } 4219 thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av); 4220 4221 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv)); 4222 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n)); 4223 thrust::for_each(zibit,zieit,VecCUDAEquals()); 4224 if (w) { 4225 cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4226 } 4227 delete w; 4228 } else { 4229 cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4230 } 4231 if (!dmem) { ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); } 4232 ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr); 4233 PetscFunctionReturn(0); 4234 } 4235