1 /* 2 Defines the basic matrix operations for the AIJ (compressed row) 3 matrix storage format using the CUSPARSE library, 4 */ 5 #define PETSC_SKIP_SPINLOCK 6 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 7 8 #include <petscconf.h> 9 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 10 #include <../src/mat/impls/sbaij/seq/sbaij.h> 11 #include <../src/vec/vec/impls/dvecimpl.h> 12 #include <petsc/private/vecimpl.h> 13 #undef VecType 14 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 15 #include <thrust/async/for_each.h> 16 17 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 18 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 19 /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in 20 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them. 21 22 typedef enum { 23 CUSPARSE_MV_ALG_DEFAULT = 0, 24 CUSPARSE_COOMV_ALG = 1, 25 CUSPARSE_CSRMV_ALG1 = 2, 26 CUSPARSE_CSRMV_ALG2 = 3 27 } cusparseSpMVAlg_t; 28 29 typedef enum { 30 CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0, 31 CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1, 32 CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2, 33 CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3, 34 CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4, 35 CUSPARSE_SPMM_ALG_DEFAULT = 0, 36 CUSPARSE_SPMM_COO_ALG1 = 1, 37 CUSPARSE_SPMM_COO_ALG2 = 2, 38 CUSPARSE_SPMM_COO_ALG3 = 3, 39 CUSPARSE_SPMM_COO_ALG4 = 5, 40 CUSPARSE_SPMM_CSR_ALG1 = 4, 41 CUSPARSE_SPMM_CSR_ALG2 = 6, 42 } cusparseSpMMAlg_t; 43 44 typedef enum { 45 CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc 46 CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministc 47 } cusparseCsr2CscAlg_t; 48 */ 49 const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0}; 50 const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0}; 51 const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! 
We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0}; 52 #endif 53 54 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 55 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 56 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 57 58 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 59 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 60 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 61 62 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 63 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 64 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 65 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 66 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat); 67 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure); 68 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar); 69 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 70 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 71 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 72 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 73 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 74 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 75 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool); 76 77 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**); 78 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**); 79 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat); 80 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**); 81 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**); 82 83 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat); 84 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat); 85 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool); 86 87 PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]); 88 PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode); 89 90 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]); 91 92 PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream) 93 { 94 cusparseStatus_t stat; 95 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 96 97 PetscFunctionBegin; 98 if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 99 cusparsestruct->stream = stream; 100 stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat); 101 PetscFunctionReturn(0); 102 } 103 104 PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle) 105 { 106 cusparseStatus_t stat; 107 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 108 109 PetscFunctionBegin; 110 if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 111 if (cusparsestruct->handle != handle) { 112 if 
(cusparsestruct->handle) { 113 stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat); 114 } 115 cusparsestruct->handle = handle; 116 } 117 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 118 PetscFunctionReturn(0); 119 } 120 121 PetscErrorCode MatCUSPARSEClearHandle(Mat A) 122 { 123 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 124 PetscBool flg; 125 PetscErrorCode ierr; 126 127 PetscFunctionBegin; 128 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 129 if (!flg || !cusparsestruct) PetscFunctionReturn(0); 130 if (cusparsestruct->handle) cusparsestruct->handle = 0; 131 PetscFunctionReturn(0); 132 } 133 134 PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type) 135 { 136 PetscFunctionBegin; 137 *type = MATSOLVERCUSPARSE; 138 PetscFunctionReturn(0); 139 } 140 141 /*MC 142 MATSOLVERCUSPARSE = "cusparse" - A matrix solver type providing triangular solvers for sequential matrices of type 143 seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp, on a single GPU. The currently supported 144 algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer 145 performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the 146 CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 147 algorithms are not recommended. This class does NOT support direct solver operations. 148 149 Level: beginner 150 151 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 152 M*/ 153 154 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B) 155 { 156 PetscErrorCode ierr; 157 PetscInt n = A->rmap->n; 158 159 PetscFunctionBegin; 160 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 161 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 162 (*B)->factortype = ftype; 163 ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 164 165 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 166 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 167 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 168 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 169 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 170 ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr); 171 ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr); 172 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 173 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 174 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 175 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr); 176 ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr); 177 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 178 179 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 180 (*B)->canuseordering = PETSC_TRUE; 181 ierr =
PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr); 182 PetscFunctionReturn(0); 183 } 184 185 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 186 { 187 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 188 189 PetscFunctionBegin; 190 switch (op) { 191 case MAT_CUSPARSE_MULT: 192 cusparsestruct->format = format; 193 break; 194 case MAT_CUSPARSE_ALL: 195 cusparsestruct->format = format; 196 break; 197 default: 198 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op); 199 } 200 PetscFunctionReturn(0); 201 } 202 203 /*@ 204 MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 205 operation. Only the MatMult operation can use different GPU storage formats 206 for MPIAIJCUSPARSE matrices. 207 Not Collective 208 209 Input Parameters: 210 + A - Matrix of type SEQAIJCUSPARSE 211 . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 212 - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2) 213 214 215 216 Level: intermediate 217 218 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 219 @*/ 220 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 221 { 222 PetscErrorCode ierr; 223 224 PetscFunctionBegin; 225 PetscValidHeaderSpecific(A, MAT_CLASSID,1); 226 ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr); 227 PetscFunctionReturn(0); 228 } 229 230 PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg) 231 { 232 PetscErrorCode ierr; 233 234 PetscFunctionBegin; 235 switch (op) { 236 case MAT_FORM_EXPLICIT_TRANSPOSE: 237 /* need to destroy the transpose matrix, if present, to prevent logic errors if flg is later set to true again */ 238 if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);} 239 A->form_explicit_transpose = flg; 240 break; 241 default: 242 ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr); 243 break; 244 } 245 PetscFunctionReturn(0); 246 } 247 248 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A); 249 250 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 251 { 252 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 253 IS isrow = b->row,iscol = b->col; 254 PetscBool row_identity,col_identity; 255 PetscErrorCode ierr; 256 257 PetscFunctionBegin; 258 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 259 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 260 B->offloadmask = PETSC_OFFLOAD_CPU; 261 /* determine which version of MatSolve needs to be used.
*/ 262 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 263 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 264 if (row_identity && col_identity) { 265 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 266 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 267 B->ops->matsolve = NULL; 268 B->ops->matsolvetranspose = NULL; 269 } else { 270 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 271 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 272 B->ops->matsolve = NULL; 273 B->ops->matsolvetranspose = NULL; 274 } 275 276 /* get the triangular factors */ 277 ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 278 PetscFunctionReturn(0); 279 } 280 281 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 282 { 283 PetscErrorCode ierr; 284 MatCUSPARSEStorageFormat format; 285 PetscBool flg; 286 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 287 288 PetscFunctionBegin; 289 ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr); 290 if (A->factortype == MAT_FACTOR_NONE) { 291 ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV", 292 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 293 if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);} 294 295 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 296 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 297 if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);} 298 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 299 ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", 300 "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr); 301 /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */ 302 if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 303 304 ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", 305 "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr); 306 if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly"); 307 308 ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", 309 "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr); 310 if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly"); 311 #endif 312 } 313 ierr = PetscOptionsTail();CHKERRQ(ierr); 314 PetscFunctionReturn(0); 315 } 316 317 static PetscErrorCode 
MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 318 { 319 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 320 PetscErrorCode ierr; 321 322 PetscFunctionBegin; 323 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 324 ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 325 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 326 PetscFunctionReturn(0); 327 } 328 329 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 330 { 331 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 332 PetscErrorCode ierr; 333 334 PetscFunctionBegin; 335 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 336 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 337 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 338 PetscFunctionReturn(0); 339 } 340 341 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 342 { 343 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 344 PetscErrorCode ierr; 345 346 PetscFunctionBegin; 347 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 348 ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 349 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 350 PetscFunctionReturn(0); 351 } 352 353 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 354 { 355 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 356 PetscErrorCode ierr; 357 358 PetscFunctionBegin; 359 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 360 ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 361 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 362 PetscFunctionReturn(0); 363 } 364 365 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 366 { 367 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 368 PetscInt n = A->rmap->n; 369 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 370 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 371 cusparseStatus_t stat; 372 const PetscInt *ai = a->i,*aj = a->j,*vi; 373 const MatScalar *aa = a->a,*v; 374 PetscInt *AiLo, *AjLo; 375 PetscInt i,nz, nzLower, offset, rowOffset; 376 PetscErrorCode ierr; 377 cudaError_t cerr; 378 379 PetscFunctionBegin; 380 if (!n) PetscFunctionReturn(0); 381 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 382 try { 383 /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. 
*/ 384 nzLower=n+ai[n]-ai[1]; 385 if (!loTriFactor) { 386 PetscScalar *AALo; 387 388 cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr); 389 390 /* Allocate Space for the lower triangular matrix */ 391 cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 392 cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr); 393 394 /* Fill the lower triangular matrix */ 395 AiLo[0] = (PetscInt) 0; 396 AiLo[n] = nzLower; 397 AjLo[0] = (PetscInt) 0; 398 AALo[0] = (MatScalar) 1.0; 399 v = aa; 400 vi = aj; 401 offset = 1; 402 rowOffset= 1; 403 for (i=1; i<n; i++) { 404 nz = ai[i+1] - ai[i]; 405 /* additional 1 for the term on the diagonal */ 406 AiLo[i] = rowOffset; 407 rowOffset += nz+1; 408 409 ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr); 410 ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr); 411 412 offset += nz; 413 AjLo[offset] = (PetscInt) i; 414 AALo[offset] = (MatScalar) 1.0; 415 offset += 1; 416 417 v += nz; 418 vi += nz; 419 } 420 421 /* allocate space for the triangular factor information */ 422 ierr = PetscNew(&loTriFactor);CHKERRQ(ierr); 423 loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 424 /* Create the matrix description */ 425 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 426 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 427 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 428 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 429 #else 430 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 431 #endif 432 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat); 433 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 434 435 /* set the operation */ 436 loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 437 438 /* set the matrix */ 439 loTriFactor->csrMat = new CsrMatrix; 440 loTriFactor->csrMat->num_rows = n; 441 loTriFactor->csrMat->num_cols = n; 442 loTriFactor->csrMat->num_entries = nzLower; 443 444 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 445 loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1); 446 447 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 448 loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower); 449 450 loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 451 loTriFactor->csrMat->values->assign(AALo, AALo+nzLower); 452 453 /* Create the solve analysis information */ 454 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 455 stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 456 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 457 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 458 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 459 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 460 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 461 &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 462 cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 463 #endif 464 465 /* perform the solve analysis */ 466 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 467 loTriFactor->csrMat->num_rows, 
loTriFactor->csrMat->num_entries, loTriFactor->descr, 468 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 469 loTriFactor->csrMat->column_indices->data().get(), 470 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 471 loTriFactor->solveInfo, 472 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 473 #else 474 loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 475 #endif 476 cerr = WaitForCUDA();CHKERRCUDA(cerr); 477 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 478 479 /* assign the pointer */ 480 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 481 loTriFactor->AA_h = AALo; 482 cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr); 483 cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr); 484 ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 485 } else { /* update values only */ 486 if (!loTriFactor->AA_h) { 487 cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr); 488 } 489 /* Fill the lower triangular matrix */ 490 loTriFactor->AA_h[0] = 1.0; 491 v = aa; 492 vi = aj; 493 offset = 1; 494 for (i=1; i<n; i++) { 495 nz = ai[i+1] - ai[i]; 496 ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr); 497 offset += nz; 498 loTriFactor->AA_h[offset] = 1.0; 499 offset += 1; 500 v += nz; 501 } 502 loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower); 503 ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 504 } 505 } catch(char *ex) { 506 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 507 } 508 } 509 PetscFunctionReturn(0); 510 } 511 512 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 513 { 514 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 515 PetscInt n = A->rmap->n; 516 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 517 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 518 cusparseStatus_t stat; 519 const PetscInt *aj = a->j,*adiag = a->diag,*vi; 520 const MatScalar *aa = a->a,*v; 521 PetscInt *AiUp, *AjUp; 522 PetscInt i,nz, nzUpper, offset; 523 PetscErrorCode ierr; 524 cudaError_t cerr; 525 526 PetscFunctionBegin; 527 if (!n) PetscFunctionReturn(0); 528 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 529 try { 530 /* next, figure out the number of nonzeros in the upper triangular matrix. 
*/ 531 nzUpper = adiag[0]-adiag[n]; 532 if (!upTriFactor) { 533 PetscScalar *AAUp; 534 535 cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 536 537 /* Allocate Space for the upper triangular matrix */ 538 cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 539 cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 540 541 /* Fill the upper triangular matrix */ 542 AiUp[0]=(PetscInt) 0; 543 AiUp[n]=nzUpper; 544 offset = nzUpper; 545 for (i=n-1; i>=0; i--) { 546 v = aa + adiag[i+1] + 1; 547 vi = aj + adiag[i+1] + 1; 548 549 /* number of elements NOT on the diagonal */ 550 nz = adiag[i] - adiag[i+1]-1; 551 552 /* decrement the offset */ 553 offset -= (nz+1); 554 555 /* first, set the diagonal elements */ 556 AjUp[offset] = (PetscInt) i; 557 AAUp[offset] = (MatScalar)1./v[nz]; 558 AiUp[i] = AiUp[i+1] - (nz+1); 559 560 ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr); 561 ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr); 562 } 563 564 /* allocate space for the triangular factor information */ 565 ierr = PetscNew(&upTriFactor);CHKERRQ(ierr); 566 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 567 568 /* Create the matrix description */ 569 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 570 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 571 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 572 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 573 #else 574 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 575 #endif 576 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 577 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 578 579 /* set the operation */ 580 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 581 582 /* set the matrix */ 583 upTriFactor->csrMat = new CsrMatrix; 584 upTriFactor->csrMat->num_rows = n; 585 upTriFactor->csrMat->num_cols = n; 586 upTriFactor->csrMat->num_entries = nzUpper; 587 588 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 589 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 590 591 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 592 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 593 594 upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 595 upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 596 597 /* Create the solve analysis information */ 598 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 599 stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 600 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 601 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 602 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 603 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 604 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 605 &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 606 cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 607 #endif 608 609 /* perform the solve analysis */ 610 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 611 upTriFactor->csrMat->num_rows, 
upTriFactor->csrMat->num_entries, upTriFactor->descr, 612 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 613 upTriFactor->csrMat->column_indices->data().get(), 614 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 615 upTriFactor->solveInfo, 616 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 617 #else 618 upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 619 #endif 620 cerr = WaitForCUDA();CHKERRCUDA(cerr); 621 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 622 623 /* assign the pointer */ 624 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 625 upTriFactor->AA_h = AAUp; 626 cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 627 cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 628 ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 629 } else { 630 if (!upTriFactor->AA_h) { 631 cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 632 } 633 /* Fill the upper triangular matrix */ 634 offset = nzUpper; 635 for (i=n-1; i>=0; i--) { 636 v = aa + adiag[i+1] + 1; 637 638 /* number of elements NOT on the diagonal */ 639 nz = adiag[i] - adiag[i+1]-1; 640 641 /* decrement the offset */ 642 offset -= (nz+1); 643 644 /* first, set the diagonal elements */ 645 upTriFactor->AA_h[offset] = 1./v[nz]; 646 ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr); 647 } 648 upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper); 649 ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 650 } 651 } catch(char *ex) { 652 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 653 } 654 } 655 PetscFunctionReturn(0); 656 } 657 658 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 659 { 660 PetscErrorCode ierr; 661 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 662 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 663 IS isrow = a->row,iscol = a->icol; 664 PetscBool row_identity,col_identity; 665 PetscInt n = A->rmap->n; 666 667 PetscFunctionBegin; 668 if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 669 ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr); 670 ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr); 671 672 if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 673 cusparseTriFactors->nnz=a->nz; 674 675 A->offloadmask = PETSC_OFFLOAD_BOTH; 676 /* lower triangular indices */ 677 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 678 if (!row_identity && !cusparseTriFactors->rpermIndices) { 679 const PetscInt *r; 680 681 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 682 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 683 cusparseTriFactors->rpermIndices->assign(r, r+n); 684 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 685 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 686 } 687 688 /* upper triangular indices */ 689 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 690 if (!col_identity && !cusparseTriFactors->cpermIndices) { 691 const PetscInt *c; 692 693 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 694 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 695 cusparseTriFactors->cpermIndices->assign(c, c+n); 696 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 697 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 698 } 699 PetscFunctionReturn(0); 700 } 
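/*
   Usage sketch (illustrative only, not part of the library source): the ILU/ICC triangular-factor routines in this
   file are normally reached through the PETSc solver interface rather than by calling them directly. A minimal,
   hedged example, assuming a MATSEQAIJCUSPARSE matrix A and assembled vectors b and x already exist, might look like:

     KSP ksp;
     PC  pc;
     ierr = KSPCreate(PETSC_COMM_SELF,&ksp);CHKERRQ(ierr);
     ierr = KSPSetOperators(ksp,A,A);CHKERRQ(ierr);
     ierr = KSPGetPC(ksp,&pc);CHKERRQ(ierr);
     ierr = PCSetType(pc,PCILU);CHKERRQ(ierr);                              // ILU(0) by default
     ierr = PCFactorSetMatSolverType(pc,MATSOLVERCUSPARSE);CHKERRQ(ierr);   // select the factor type registered above
     ierr = KSPSetFromOptions(ksp);CHKERRQ(ierr);
     ierr = KSPSolve(ksp,b,x);CHKERRQ(ierr);
     ierr = KSPDestroy(&ksp);CHKERRQ(ierr);

   or, equivalently, on the command line: -mat_type seqaijcusparse -pc_type ilu -pc_factor_mat_solver_type cusparse.
   Note that the numeric factorization itself still runs on the CPU (MatLUFactorNumeric_SeqAIJ / MatCholeskyFactorNumeric_SeqAIJ
   above); only the solve analysis and the triangular solves are performed on the GPU.
*/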
701 702 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 703 { 704 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 705 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 706 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 707 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 708 cusparseStatus_t stat; 709 PetscErrorCode ierr; 710 cudaError_t cerr; 711 PetscInt *AiUp, *AjUp; 712 PetscScalar *AAUp; 713 PetscScalar *AALo; 714 PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 715 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 716 const PetscInt *ai = b->i,*aj = b->j,*vj; 717 const MatScalar *aa = b->a,*v; 718 719 PetscFunctionBegin; 720 if (!n) PetscFunctionReturn(0); 721 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 722 try { 723 cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 724 cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 725 if (!upTriFactor && !loTriFactor) { 726 /* Allocate Space for the upper triangular matrix */ 727 cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 728 cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 729 730 /* Fill the upper triangular matrix */ 731 AiUp[0]=(PetscInt) 0; 732 AiUp[n]=nzUpper; 733 offset = 0; 734 for (i=0; i<n; i++) { 735 /* set the pointers */ 736 v = aa + ai[i]; 737 vj = aj + ai[i]; 738 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 739 740 /* first, set the diagonal elements */ 741 AjUp[offset] = (PetscInt) i; 742 AAUp[offset] = (MatScalar)1.0/v[nz]; 743 AiUp[i] = offset; 744 AALo[offset] = (MatScalar)1.0/v[nz]; 745 746 offset+=1; 747 if (nz>0) { 748 ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr); 749 ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 750 for (j=offset; j<offset+nz; j++) { 751 AAUp[j] = -AAUp[j]; 752 AALo[j] = AAUp[j]/v[nz]; 753 } 754 offset+=nz; 755 } 756 } 757 758 /* allocate space for the triangular factor information */ 759 ierr = PetscNew(&upTriFactor);CHKERRQ(ierr); 760 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 761 762 /* Create the matrix description */ 763 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 764 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 765 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 766 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 767 #else 768 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 769 #endif 770 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 771 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 772 773 /* set the matrix */ 774 upTriFactor->csrMat = new CsrMatrix; 775 upTriFactor->csrMat->num_rows = A->rmap->n; 776 upTriFactor->csrMat->num_cols = A->cmap->n; 777 upTriFactor->csrMat->num_entries = a->nz; 778 779 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 780 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 781 782 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 783 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 784 785 
upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 786 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 787 788 /* set the operation */ 789 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 790 791 /* Create the solve analysis information */ 792 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 793 stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 794 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 795 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 796 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 797 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 798 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 799 &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 800 cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 801 #endif 802 803 /* perform the solve analysis */ 804 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 805 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 806 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 807 upTriFactor->csrMat->column_indices->data().get(), 808 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 809 upTriFactor->solveInfo, 810 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 811 #else 812 upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 813 #endif 814 cerr = WaitForCUDA();CHKERRCUDA(cerr); 815 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 816 817 /* assign the pointer */ 818 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 819 820 /* allocate space for the triangular factor information */ 821 ierr = PetscNew(&loTriFactor);CHKERRQ(ierr); 822 loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 823 824 /* Create the matrix description */ 825 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 826 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 827 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 828 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 829 #else 830 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 831 #endif 832 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 833 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 834 835 /* set the operation */ 836 loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 837 838 /* set the matrix */ 839 loTriFactor->csrMat = new CsrMatrix; 840 loTriFactor->csrMat->num_rows = A->rmap->n; 841 loTriFactor->csrMat->num_cols = A->cmap->n; 842 loTriFactor->csrMat->num_entries = a->nz; 843 844 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 845 loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 846 847 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 848 loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 849 850 loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 851 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 852 853 /* Create the solve analysis information */ 854 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 855 stat = 
cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 856 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 857 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 858 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 859 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 860 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 861 &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 862 cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 863 #endif 864 865 /* perform the solve analysis */ 866 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 867 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 868 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 869 loTriFactor->csrMat->column_indices->data().get(), 870 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 871 loTriFactor->solveInfo, 872 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 873 #else 874 loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 875 #endif 876 cerr = WaitForCUDA();CHKERRCUDA(cerr); 877 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 878 879 /* assign the pointer */ 880 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 881 882 ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr); 883 cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 884 cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 885 } else { 886 /* Fill the upper triangular matrix */ 887 offset = 0; 888 for (i=0; i<n; i++) { 889 /* set the pointers */ 890 v = aa + ai[i]; 891 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 892 893 /* first, set the diagonal elements */ 894 AAUp[offset] = 1.0/v[nz]; 895 AALo[offset] = 1.0/v[nz]; 896 897 offset+=1; 898 if (nz>0) { 899 ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 900 for (j=offset; j<offset+nz; j++) { 901 AAUp[j] = -AAUp[j]; 902 AALo[j] = AAUp[j]/v[nz]; 903 } 904 offset+=nz; 905 } 906 } 907 if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 908 if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 909 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 910 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 911 ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 912 } 913 cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr); 914 cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr); 915 } catch(char *ex) { 916 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 917 } 918 } 919 PetscFunctionReturn(0); 920 } 921 922 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 923 { 924 PetscErrorCode ierr; 925 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 926 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 927 IS ip = a->row; 928 PetscBool perm_identity; 929 PetscInt n = A->rmap->n; 930 931 PetscFunctionBegin; 932 if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 933 ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr); 934 if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 935 cusparseTriFactors->nnz=(a->nz-n)*2 + n; 936 937 A->offloadmask = PETSC_OFFLOAD_BOTH; 938 939 /* lower triangular indices */ 940 ierr = 
ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 941 if (!perm_identity) { 942 IS iip; 943 const PetscInt *irip,*rip; 944 945 ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr); 946 ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr); 947 ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); 948 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 949 cusparseTriFactors->rpermIndices->assign(rip, rip+n); 950 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 951 cusparseTriFactors->cpermIndices->assign(irip, irip+n); 952 ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr); 953 ierr = ISDestroy(&iip);CHKERRQ(ierr); 954 ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 955 ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 956 } 957 PetscFunctionReturn(0); 958 } 959 960 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 961 { 962 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 963 IS ip = b->row; 964 PetscBool perm_identity; 965 PetscErrorCode ierr; 966 967 PetscFunctionBegin; 968 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 969 ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 970 B->offloadmask = PETSC_OFFLOAD_CPU; 971 /* determine which version of MatSolve needs to be used. */ 972 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 973 if (perm_identity) { 974 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 975 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 976 B->ops->matsolve = NULL; 977 B->ops->matsolvetranspose = NULL; 978 } else { 979 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 980 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 981 B->ops->matsolve = NULL; 982 B->ops->matsolvetranspose = NULL; 983 } 984 985 /* get the triangular factors */ 986 ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 987 PetscFunctionReturn(0); 988 } 989 990 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 991 { 992 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 993 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 994 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 995 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; 996 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; 997 cusparseStatus_t stat; 998 cusparseIndexBase_t indexBase; 999 cusparseMatrixType_t matrixType; 1000 cusparseFillMode_t fillMode; 1001 cusparseDiagType_t diagType; 1002 cudaError_t cerr; 1003 PetscErrorCode ierr; 1004 1005 PetscFunctionBegin; 1006 /* allocate space for the transpose of the lower triangular factor */ 1007 ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr); 1008 loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1009 1010 /* set the matrix descriptors of the lower triangular factor */ 1011 matrixType = cusparseGetMatType(loTriFactor->descr); 1012 indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 1013 fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 
1014 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1015 diagType = cusparseGetMatDiagType(loTriFactor->descr); 1016 1017 /* Create the matrix description */ 1018 stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat); 1019 stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 1020 stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 1021 stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 1022 stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1023 1024 /* set the operation */ 1025 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1026 1027 /* allocate GPU space for the CSC of the lower triangular factor*/ 1028 loTriFactorT->csrMat = new CsrMatrix; 1029 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 1030 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 1031 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 1032 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1); 1033 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 1034 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 1035 1036 /* compute the transpose of the lower triangular factor, i.e. the CSC */ 1037 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1038 stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1039 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1040 loTriFactor->csrMat->values->data().get(), 1041 loTriFactor->csrMat->row_offsets->data().get(), 1042 loTriFactor->csrMat->column_indices->data().get(), 1043 loTriFactorT->csrMat->values->data().get(), 1044 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1045 CUSPARSE_ACTION_NUMERIC,indexBase, 1046 CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1047 cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1048 #endif 1049 1050 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1051 stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1052 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1053 loTriFactor->csrMat->values->data().get(), 1054 loTriFactor->csrMat->row_offsets->data().get(), 1055 loTriFactor->csrMat->column_indices->data().get(), 1056 loTriFactorT->csrMat->values->data().get(), 1057 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1058 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1059 CUSPARSE_ACTION_NUMERIC, indexBase, 1060 CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat); 1061 #else 1062 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1063 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1064 #endif 1065 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1066 ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1067 1068 /* Create the solve analysis information */ 1069 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1070 stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1071 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1072 stat =
cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, 1073 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1074 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1075 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, 1076 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1077 cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1078 #endif 1079 1080 /* perform the solve analysis */ 1081 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 1082 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1083 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1084 loTriFactorT->csrMat->column_indices->data().get(), 1085 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1086 loTriFactorT->solveInfo, 1087 loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1088 #else 1089 loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1090 #endif 1091 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1092 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1093 1094 /* assign the pointer */ 1095 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 1096 1097 /*********************************************/ 1098 /* Now the Transpose of the Upper Tri Factor */ 1099 /*********************************************/ 1100 1101 /* allocate space for the transpose of the upper triangular factor */ 1102 ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr); 1103 upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1104 1105 /* set the matrix descriptors of the upper triangular factor */ 1106 matrixType = cusparseGetMatType(upTriFactor->descr); 1107 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 1108 fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1109 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1110 diagType = cusparseGetMatDiagType(upTriFactor->descr); 1111 1112 /* Create the matrix description */ 1113 stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat); 1114 stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 1115 stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 1116 stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 1117 stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1118 1119 /* set the operation */ 1120 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1121 1122 /* allocate GPU space for the CSC of the upper triangular factor*/ 1123 upTriFactorT->csrMat = new CsrMatrix; 1124 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1125 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1126 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1127 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1); 1128 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1129 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1130 1131 /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 1132 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1133 stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows, 1134 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1135 upTriFactor->csrMat->values->data().get(), 1136 upTriFactor->csrMat->row_offsets->data().get(), 1137 upTriFactor->csrMat->column_indices->data().get(), 1138 upTriFactorT->csrMat->values->data().get(), 1139 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1140 CUSPARSE_ACTION_NUMERIC,indexBase, 1141 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1142 cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1143 #endif 1144 1145 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1146 stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 1147 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1148 upTriFactor->csrMat->values->data().get(), 1149 upTriFactor->csrMat->row_offsets->data().get(), 1150 upTriFactor->csrMat->column_indices->data().get(), 1151 upTriFactorT->csrMat->values->data().get(), 1152 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1153 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1154 CUSPARSE_ACTION_NUMERIC, indexBase, 1155 CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat); 1156 #else 1157 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1158 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1159 #endif 1160 1161 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1162 ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1163 1164 /* Create the solve analysis information */ 1165 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1166 stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1167 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1168 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, 1169 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1170 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1171 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, 1172 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1173 cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1174 #endif 1175 1176 /* perform the solve analysis */ 1177 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 1178 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1179 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1180 upTriFactorT->csrMat->column_indices->data().get(), 1181 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1182 upTriFactorT->solveInfo, 1183 upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1184 #else 1185 upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1186 #endif 1187 1188 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1189 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1190 1191 /* assign the pointer */ 1192 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 1193 PetscFunctionReturn(0); 1194 } 1195
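/*
   Editorial note (hedged): the CSC copies of the triangular factors built above back the MatSolveTranspose_SeqAIJCUSPARSE()
   path. The analogous path for MatMultTranspose() is MatSeqAIJCUSPARSEFormExplicitTransposeForMult() below, which is only
   taken when the explicit-transpose option has been enabled on the matrix. A minimal sketch, assuming a MATSEQAIJCUSPARSE
   matrix A and compatible vectors x and y:

     ierr = MatSetOption(A,MAT_FORM_EXPLICIT_TRANSPOSE,PETSC_TRUE);CHKERRQ(ierr);
     ierr = MatMultTranspose(A,x,y);CHKERRQ(ierr);   // the cached explicit transpose is reused until A changes

   Clearing the option again (see MatSetOption_SeqAIJCUSPARSE above) destroys any cached transpose via
   MatSeqAIJCUSPARSEInvalidateTranspose().
*/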
1196 struct PetscScalarToPetscInt 1197 { 1198 __host__ __device__ 1199 PetscInt operator()(PetscScalar s) 1200 { 1201 return (PetscInt)PetscRealPart(s); 1202 } 1203 }; 1204 1205 static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTransposeForMult(Mat A) 1206 { 1207 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1208 Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT; 1209 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1210 cusparseStatus_t stat; 1211 cusparseIndexBase_t indexBase; 1212 cudaError_t err; 1213 PetscErrorCode ierr; 1214 1215 PetscFunctionBegin; 1216 if (!A->form_explicit_transpose || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0); 1217 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1218 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1219 if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct"); 1220 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1221 if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct"); 1222 if (A->transupdated) PetscFunctionReturn(0); 1223 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1224 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1225 if (cusparsestruct->format != MAT_CUSPARSE_CSR) { 1226 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 1227 } 1228 if (!cusparsestruct->matTranspose) { /* create cusparse matrix */ 1229 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 1230 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat); 1231 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1232 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat); 1233 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1234 1235 /* set alpha and beta */ 1236 err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 1237 err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1238 err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1239 err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1240 err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1241 err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1242 1243 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1244 CsrMatrix *matrixT = new CsrMatrix; 1245 matstructT->mat = matrixT; 1246 matrixT->num_rows = A->cmap->n; 1247 matrixT->num_cols = A->rmap->n; 1248 matrixT->num_entries = a->nz; 1249 matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 1250 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1251 matrixT->values = new THRUSTARRAY(a->nz); 1252 1253 if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); } 1254 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 1255 1256 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1257 stat = cusparseCreateCsr(&matstructT->matDescr, 1258 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1259 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1260 matrixT->values->data().get(), 1261 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1262 
indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 1263 #endif 1264 } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 1265 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1266 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1267 #else 1268 CsrMatrix *temp = new CsrMatrix; 1269 CsrMatrix *tempT = new CsrMatrix; 1270 /* First convert HYB to CSR */ 1271 temp->num_rows = A->rmap->n; 1272 temp->num_cols = A->cmap->n; 1273 temp->num_entries = a->nz; 1274 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1275 temp->column_indices = new THRUSTINTARRAY32(a->nz); 1276 temp->values = new THRUSTARRAY(a->nz); 1277 1278 stat = cusparse_hyb2csr(cusparsestruct->handle, 1279 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 1280 temp->values->data().get(), 1281 temp->row_offsets->data().get(), 1282 temp->column_indices->data().get());CHKERRCUSPARSE(stat); 1283 1284 /* Next, convert CSR to CSC (i.e. the matrix transpose) */ 1285 tempT->num_rows = A->rmap->n; 1286 tempT->num_cols = A->cmap->n; 1287 tempT->num_entries = a->nz; 1288 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1289 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1290 tempT->values = new THRUSTARRAY(a->nz); 1291 1292 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1293 temp->num_cols, temp->num_entries, 1294 temp->values->data().get(), 1295 temp->row_offsets->data().get(), 1296 temp->column_indices->data().get(), 1297 tempT->values->data().get(), 1298 tempT->column_indices->data().get(), 1299 tempT->row_offsets->data().get(), 1300 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1301 1302 /* Last, convert CSC to HYB */ 1303 cusparseHybMat_t hybMat; 1304 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1305 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 
1306 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1307 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1308 matstructT->descr, tempT->values->data().get(), 1309 tempT->row_offsets->data().get(), 1310 tempT->column_indices->data().get(), 1311 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1312 1313 /* assign the pointer */ 1314 matstructT->mat = hybMat; 1315 A->transupdated = PETSC_TRUE; 1316 /* delete temporaries */ 1317 if (tempT) { 1318 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1319 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1320 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1321 delete (CsrMatrix*) tempT; 1322 } 1323 if (temp) { 1324 if (temp->values) delete (THRUSTARRAY*) temp->values; 1325 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1326 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1327 delete (CsrMatrix*) temp; 1328 } 1329 #endif 1330 } 1331 } 1332 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */ 1333 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1334 CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat; 1335 if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix"); 1336 if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows"); 1337 if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols"); 1338 if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values"); 1339 if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT"); 1340 if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows"); 1341 if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols"); 1342 if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values"); 1343 if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */ 1344 cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1345 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 1346 ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 1347 } 1348 if (!cusparsestruct->csr2csc_i) { 1349 THRUSTARRAY csr2csc_a(matrix->num_entries); 1350 PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0)); 1351 1352 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1353 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1354 void *csr2cscBuffer; 1355 size_t csr2cscBufferSize; 1356 stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, 1357 A->cmap->n, matrix->num_entries, 1358 matrix->values->data().get(), 1359 cusparsestruct->rowoffsets_gpu->data().get(), 1360 matrix->column_indices->data().get(), 1361 matrixT->values->data().get(), 1362 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1363 CUSPARSE_ACTION_NUMERIC,indexBase, 1364 cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat); 1365 err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err); 1366 #endif 1367 1368 if (matrix->num_entries) { 1369 /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in 1370 mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK. 
I checked every parameter and they were just fine. I have no clue why cusparse complains. 1372 1373 Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[] 1374 should be filled with indexBase. So I just take a shortcut here. 1375 */ 1376 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1377 A->cmap->n,matrix->num_entries, 1378 csr2csc_a.data().get(), 1379 cusparsestruct->rowoffsets_gpu->data().get(), 1380 matrix->column_indices->data().get(), 1381 matrixT->values->data().get(), 1382 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1383 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1384 CUSPARSE_ACTION_NUMERIC,indexBase, 1385 cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat); 1386 #else 1387 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1388 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1389 #endif 1390 } else { 1391 matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 1392 } 1393 1394 cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); 1395 PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt())); 1396 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1397 err = cudaFree(csr2cscBuffer);CHKERRCUDA(err); 1398 #endif 1399 } 1400 PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), 1401 thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), 1402 matrixT->values->begin())); 1403 } 1404 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1405 ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1406 /* the compressed row indices are not used for matTranspose */ 1407 matstructT->cprowIndices = NULL; 1408 /* assign the pointer */ 1409 ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 1410 A->transupdated = PETSC_TRUE; 1411 PetscFunctionReturn(0); 1412 } 1413 1414 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 1415 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1416 { 1417 PetscInt n = xx->map->n; 1418 const PetscScalar *barray; 1419 PetscScalar *xarray; 1420 thrust::device_ptr<const PetscScalar> bGPU; 1421 thrust::device_ptr<PetscScalar> xGPU; 1422 cusparseStatus_t stat; 1423 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1424 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1425 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1426 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1427 PetscErrorCode ierr; 1428 1429 PetscFunctionBegin; 1430 /* Analyze the matrix and create the transpose ...
on the fly */ 1431 if (!loTriFactorT && !upTriFactorT) { 1432 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1433 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1434 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1435 } 1436 1437 /* Get the GPU pointers */ 1438 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1439 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1440 xGPU = thrust::device_pointer_cast(xarray); 1441 bGPU = thrust::device_pointer_cast(barray); 1442 1443 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1444 /* First, reorder with the row permutation */ 1445 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1446 thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 1447 xGPU); 1448 1449 /* First, solve U */ 1450 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1451 upTriFactorT->csrMat->num_rows, 1452 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1453 upTriFactorT->csrMat->num_entries, 1454 #endif 1455 &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1456 upTriFactorT->csrMat->values->data().get(), 1457 upTriFactorT->csrMat->row_offsets->data().get(), 1458 upTriFactorT->csrMat->column_indices->data().get(), 1459 upTriFactorT->solveInfo, 1460 xarray, 1461 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1462 tempGPU->data().get(), 1463 upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1464 #else 1465 tempGPU->data().get());CHKERRCUSPARSE(stat); 1466 #endif 1467 1468 /* Then, solve L */ 1469 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1470 loTriFactorT->csrMat->num_rows, 1471 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1472 loTriFactorT->csrMat->num_entries, 1473 #endif 1474 &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1475 loTriFactorT->csrMat->values->data().get(), 1476 loTriFactorT->csrMat->row_offsets->data().get(), 1477 loTriFactorT->csrMat->column_indices->data().get(), 1478 loTriFactorT->solveInfo, 1479 tempGPU->data().get(), 1480 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1481 xarray, 1482 loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1483 #else 1484 xarray);CHKERRCUSPARSE(stat); 1485 #endif 1486 1487 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1488 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1489 thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1490 tempGPU->begin()); 1491 1492 /* Copy the temporary to the full solution. 
*/ 1493 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU); 1494 1495 /* restore */ 1496 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1497 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1498 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1499 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1500 PetscFunctionReturn(0); 1501 } 1502 1503 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1504 { 1505 const PetscScalar *barray; 1506 PetscScalar *xarray; 1507 cusparseStatus_t stat; 1508 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1509 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1510 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1511 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1512 PetscErrorCode ierr; 1513 1514 PetscFunctionBegin; 1515 /* Analyze the matrix and create the transpose ... on the fly */ 1516 if (!loTriFactorT && !upTriFactorT) { 1517 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1518 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1519 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1520 } 1521 1522 /* Get the GPU pointers */ 1523 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1524 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1525 1526 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1527 /* First, solve U */ 1528 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1529 upTriFactorT->csrMat->num_rows, 1530 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1531 upTriFactorT->csrMat->num_entries, 1532 #endif 1533 &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1534 upTriFactorT->csrMat->values->data().get(), 1535 upTriFactorT->csrMat->row_offsets->data().get(), 1536 upTriFactorT->csrMat->column_indices->data().get(), 1537 upTriFactorT->solveInfo, 1538 barray, 1539 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1540 tempGPU->data().get(), 1541 upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1542 #else 1543 tempGPU->data().get());CHKERRCUSPARSE(stat); 1544 #endif 1545 1546 /* Then, solve L */ 1547 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1548 loTriFactorT->csrMat->num_rows, 1549 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1550 loTriFactorT->csrMat->num_entries, 1551 #endif 1552 &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1553 loTriFactorT->csrMat->values->data().get(), 1554 loTriFactorT->csrMat->row_offsets->data().get(), 1555 loTriFactorT->csrMat->column_indices->data().get(), 1556 loTriFactorT->solveInfo, 1557 tempGPU->data().get(), 1558 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1559 xarray, 1560 loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1561 #else 1562 xarray);CHKERRCUSPARSE(stat); 1563 #endif 1564 1565 /* restore */ 1566 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1567 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1568 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1569 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1570 PetscFunctionReturn(0); 1571 } 1572 1573 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1574 { 1575 const 
PetscScalar *barray; 1576 PetscScalar *xarray; 1577 thrust::device_ptr<const PetscScalar> bGPU; 1578 thrust::device_ptr<PetscScalar> xGPU; 1579 cusparseStatus_t stat; 1580 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1581 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1582 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1583 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1584 PetscErrorCode ierr; 1585 1586 PetscFunctionBegin; 1587 1588 /* Get the GPU pointers */ 1589 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1590 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1591 xGPU = thrust::device_pointer_cast(xarray); 1592 bGPU = thrust::device_pointer_cast(barray); 1593 1594 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1595 /* First, reorder with the row permutation */ 1596 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1597 thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 1598 tempGPU->begin()); 1599 1600 /* Next, solve L */ 1601 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1602 loTriFactor->csrMat->num_rows, 1603 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1604 loTriFactor->csrMat->num_entries, 1605 #endif 1606 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1607 loTriFactor->csrMat->values->data().get(), 1608 loTriFactor->csrMat->row_offsets->data().get(), 1609 loTriFactor->csrMat->column_indices->data().get(), 1610 loTriFactor->solveInfo, 1611 tempGPU->data().get(), 1612 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1613 xarray, 1614 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1615 #else 1616 xarray);CHKERRCUSPARSE(stat); 1617 #endif 1618 1619 /* Then, solve U */ 1620 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1621 upTriFactor->csrMat->num_rows, 1622 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1623 upTriFactor->csrMat->num_entries, 1624 #endif 1625 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1626 upTriFactor->csrMat->values->data().get(), 1627 upTriFactor->csrMat->row_offsets->data().get(), 1628 upTriFactor->csrMat->column_indices->data().get(), 1629 upTriFactor->solveInfo,xarray, 1630 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1631 tempGPU->data().get(), 1632 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1633 #else 1634 tempGPU->data().get());CHKERRCUSPARSE(stat); 1635 #endif 1636 1637 1638 /* Last, reorder with the column permutation */ 1639 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 1640 thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), 1641 xGPU); 1642 1643 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1644 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1645 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1646 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1647 PetscFunctionReturn(0); 1648 } 1649 1650 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1651 { 1652 const PetscScalar *barray; 1653 PetscScalar *xarray; 1654 cusparseStatus_t stat; 1655 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = 
(Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1656 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1657 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1658 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1659 PetscErrorCode ierr; 1660 1661 PetscFunctionBegin; 1662 /* Get the GPU pointers */ 1663 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1664 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1665 1666 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1667 /* First, solve L */ 1668 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1669 loTriFactor->csrMat->num_rows, 1670 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1671 loTriFactor->csrMat->num_entries, 1672 #endif 1673 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1674 loTriFactor->csrMat->values->data().get(), 1675 loTriFactor->csrMat->row_offsets->data().get(), 1676 loTriFactor->csrMat->column_indices->data().get(), 1677 loTriFactor->solveInfo, 1678 barray, 1679 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1680 tempGPU->data().get(), 1681 loTriFactor->solvePolicy,loTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1682 #else 1683 tempGPU->data().get());CHKERRCUSPARSE(stat); 1684 #endif 1685 1686 1687 /* Next, solve U */ 1688 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1689 upTriFactor->csrMat->num_rows, 1690 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1691 upTriFactor->csrMat->num_entries, 1692 #endif 1693 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1694 upTriFactor->csrMat->values->data().get(), 1695 upTriFactor->csrMat->row_offsets->data().get(), 1696 upTriFactor->csrMat->column_indices->data().get(), 1697 upTriFactor->solveInfo, 1698 tempGPU->data().get(), 1699 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1700 xarray, 1701 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1702 #else 1703 xarray);CHKERRCUSPARSE(stat); 1704 #endif 1705 1706 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1707 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1708 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1709 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1710 PetscFunctionReturn(0); 1711 } 1712 1713 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) 1714 { 1715 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1716 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1717 cudaError_t cerr; 1718 PetscErrorCode ierr; 1719 1720 PetscFunctionBegin; 1721 if (A->offloadmask == PETSC_OFFLOAD_GPU) { 1722 CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat; 1723 1724 ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 1725 cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 1726 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1727 ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr); 1728 ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 1729 A->offloadmask = PETSC_OFFLOAD_BOTH; 1730 } 1731 PetscFunctionReturn(0); 1732 } 1733 1734 static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 1735 { 1736 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1737 PetscErrorCode ierr; 1738 1739 PetscFunctionBegin; 1740 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 1741 *array = a->a; 1742 A->offloadmask = PETSC_OFFLOAD_CPU; 1743 PetscFunctionReturn(0); 1744 } 1745 1746 static 
PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 1747 { 1748 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1749 Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 1750 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1751 PetscInt m = A->rmap->n,*ii,*ridx,tmp; 1752 PetscErrorCode ierr; 1753 cusparseStatus_t stat; 1754 PetscBool both = PETSC_TRUE; 1755 cudaError_t err; 1756 1757 PetscFunctionBegin; 1758 if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU"); 1759 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 1760 if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */ 1761 CsrMatrix *matrix; 1762 matrix = (CsrMatrix*)cusparsestruct->mat->mat; 1763 1764 if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values"); 1765 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1766 matrix->values->assign(a->a, a->a+a->nz); 1767 err = WaitForCUDA();CHKERRCUDA(err); 1768 ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 1769 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1770 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); 1771 } else { 1772 PetscInt nnz; 1773 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1774 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr); 1775 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 1776 delete cusparsestruct->workVector; 1777 delete cusparsestruct->rowoffsets_gpu; 1778 cusparsestruct->workVector = NULL; 1779 cusparsestruct->rowoffsets_gpu = NULL; 1780 try { 1781 if (a->compressedrow.use) { 1782 m = a->compressedrow.nrows; 1783 ii = a->compressedrow.i; 1784 ridx = a->compressedrow.rindex; 1785 } else { 1786 m = A->rmap->n; 1787 ii = a->i; 1788 ridx = NULL; 1789 } 1790 if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data"); 1791 if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data"); 1792 if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; } 1793 else nnz = a->nz; 1794 1795 /* create cusparse matrix */ 1796 cusparsestruct->nrows = m; 1797 matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 1798 stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat); 1799 stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 1800 stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1801 1802 err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 1803 err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1804 err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1805 err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1806 err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1807 err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1808 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 1809 1810 /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1811 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1812 /* set the matrix */ 
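/* For reference, the (0-based) CSR triplet copied into the CsrMatrix that follows; e.g. for the 3x3 matrix
       [1 0 2]
       [0 3 0]
       [4 0 5]
   ii   (row offsets, length m+1) = {0,2,3,5}
   a->j (column indices, nnz)     = {0,2,1,0,2}
   a->a (values, nnz)             = {1,2,3,4,5}
   with nnz = ii[m]; when compressed row storage is in use, m counts only the nonempty rows and ridx maps
   them back to the global row numbers. */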
1813 CsrMatrix *mat= new CsrMatrix; 1814 mat->num_rows = m; 1815 mat->num_cols = A->cmap->n; 1816 mat->num_entries = nnz; 1817 mat->row_offsets = new THRUSTINTARRAY32(m+1); 1818 mat->row_offsets->assign(ii, ii + m+1); 1819 1820 mat->column_indices = new THRUSTINTARRAY32(nnz); 1821 mat->column_indices->assign(a->j, a->j+nnz); 1822 1823 mat->values = new THRUSTARRAY(nnz); 1824 if (a->a) mat->values->assign(a->a, a->a+nnz); 1825 1826 /* assign the pointer */ 1827 matstruct->mat = mat; 1828 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1829 if (mat->num_rows) { /* cusparse errors on empty matrices! */ 1830 stat = cusparseCreateCsr(&matstruct->matDescr, 1831 mat->num_rows, mat->num_cols, mat->num_entries, 1832 mat->row_offsets->data().get(), mat->column_indices->data().get(), 1833 mat->values->data().get(), 1834 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 1835 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 1836 } 1837 #endif 1838 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1839 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1840 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1841 #else 1842 CsrMatrix *mat= new CsrMatrix; 1843 mat->num_rows = m; 1844 mat->num_cols = A->cmap->n; 1845 mat->num_entries = nnz; 1846 mat->row_offsets = new THRUSTINTARRAY32(m+1); 1847 mat->row_offsets->assign(ii, ii + m+1); 1848 1849 mat->column_indices = new THRUSTINTARRAY32(nnz); 1850 mat->column_indices->assign(a->j, a->j+nnz); 1851 1852 mat->values = new THRUSTARRAY(nnz); 1853 if (a->a) mat->values->assign(a->a, a->a+nnz); 1854 1855 cusparseHybMat_t hybMat; 1856 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1857 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 
1858 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1859 stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, 1860 matstruct->descr, mat->values->data().get(), 1861 mat->row_offsets->data().get(), 1862 mat->column_indices->data().get(), 1863 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1864 /* assign the pointer */ 1865 matstruct->mat = hybMat; 1866 1867 if (mat) { 1868 if (mat->values) delete (THRUSTARRAY*)mat->values; 1869 if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices; 1870 if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets; 1871 delete (CsrMatrix*)mat; 1872 } 1873 #endif 1874 } 1875 1876 /* assign the compressed row indices */ 1877 if (a->compressedrow.use) { 1878 cusparsestruct->workVector = new THRUSTARRAY(m); 1879 matstruct->cprowIndices = new THRUSTINTARRAY(m); 1880 matstruct->cprowIndices->assign(ridx,ridx+m); 1881 tmp = m; 1882 } else { 1883 cusparsestruct->workVector = NULL; 1884 matstruct->cprowIndices = NULL; 1885 tmp = 0; 1886 } 1887 ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr); 1888 1889 /* assign the pointer */ 1890 cusparsestruct->mat = matstruct; 1891 } catch(char *ex) { 1892 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1893 } 1894 err = WaitForCUDA();CHKERRCUDA(err); 1895 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1896 cusparsestruct->nonzerostate = A->nonzerostate; 1897 } 1898 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 1899 } 1900 PetscFunctionReturn(0); 1901 } 1902 1903 struct VecCUDAPlusEquals 1904 { 1905 template <typename Tuple> 1906 __host__ __device__ 1907 void operator()(Tuple t) 1908 { 1909 thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 1910 } 1911 }; 1912 1913 struct VecCUDAEquals 1914 { 1915 template <typename Tuple> 1916 __host__ __device__ 1917 void operator()(Tuple t) 1918 { 1919 thrust::get<1>(t) = thrust::get<0>(t); 1920 } 1921 }; 1922 1923 struct VecCUDAEqualsReverse 1924 { 1925 template <typename Tuple> 1926 __host__ __device__ 1927 void operator()(Tuple t) 1928 { 1929 thrust::get<0>(t) = thrust::get<1>(t); 1930 } 1931 }; 1932 1933 struct MatMatCusparse { 1934 PetscBool cisdense; 1935 PetscScalar *Bt; 1936 Mat X; 1937 PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */ 1938 PetscLogDouble flops; 1939 CsrMatrix *Bcsr; 1940 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1941 cusparseSpMatDescr_t matSpBDescr; 1942 PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 1943 cusparseDnMatDescr_t matBDescr; 1944 cusparseDnMatDescr_t matCDescr; 1945 PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/ 1946 size_t mmBufferSize; 1947 void *mmBuffer; 1948 void *mmBuffer2; /* SpGEMM WorkEstimation buffer */ 1949 cusparseSpGEMMDescr_t spgemmDesc; 1950 #endif 1951 }; 1952 1953 static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 1954 { 1955 PetscErrorCode ierr; 1956 MatMatCusparse *mmdata = (MatMatCusparse *)data; 1957 cudaError_t cerr; 1958 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1959 cusparseStatus_t stat; 1960 #endif 1961 1962 PetscFunctionBegin; 1963 cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr); 1964 delete mmdata->Bcsr; 1965 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1966 if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); } 1967 if (mmdata->mmBuffer) { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); } 1968 if 
(mmdata->mmBuffer2) { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); } 1969 if (mmdata->matBDescr) { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); } 1970 if (mmdata->matCDescr) { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); } 1971 if (mmdata->spgemmDesc) { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); } 1972 #endif 1973 ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr); 1974 ierr = PetscFree(data);CHKERRQ(ierr); 1975 PetscFunctionReturn(0); 1976 } 1977 1978 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool); 1979 1980 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 1981 { 1982 Mat_Product *product = C->product; 1983 Mat A,B; 1984 PetscInt m,n,blda,clda; 1985 PetscBool flg,biscuda; 1986 Mat_SeqAIJCUSPARSE *cusp; 1987 cusparseStatus_t stat; 1988 cusparseOperation_t opA; 1989 const PetscScalar *barray; 1990 PetscScalar *carray; 1991 PetscErrorCode ierr; 1992 MatMatCusparse *mmdata; 1993 Mat_SeqAIJCUSPARSEMultStruct *mat; 1994 CsrMatrix *csrmat; 1995 1996 PetscFunctionBegin; 1997 MatCheckProduct(C,1); 1998 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty"); 1999 mmdata = (MatMatCusparse*)product->data; 2000 A = product->A; 2001 B = product->B; 2002 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2003 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2004 /* currently CopyToGpu does not copy if the matrix is bound to CPU 2005 Instead of silently accepting the wrong answer, I prefer to raise the error */ 2006 if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2007 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2008 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2009 switch (product->type) { 2010 case MATPRODUCT_AB: 2011 case MATPRODUCT_PtAP: 2012 mat = cusp->mat; 2013 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2014 m = A->rmap->n; 2015 n = B->cmap->n; 2016 break; 2017 case MATPRODUCT_AtB: 2018 if (!A->form_explicit_transpose) { 2019 mat = cusp->mat; 2020 opA = CUSPARSE_OPERATION_TRANSPOSE; 2021 } else { 2022 ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr); 2023 mat = cusp->matTranspose; 2024 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2025 } 2026 m = A->cmap->n; 2027 n = B->cmap->n; 2028 break; 2029 case MATPRODUCT_ABt: 2030 case MATPRODUCT_RARt: 2031 mat = cusp->mat; 2032 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2033 m = A->rmap->n; 2034 n = B->rmap->n; 2035 break; 2036 default: 2037 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2038 } 2039 if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 2040 csrmat = (CsrMatrix*)mat->mat; 2041 /* if the user passed a CPU matrix, copy the data to the GPU */ 2042 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr); 2043 if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);} 2044 ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr); 2045 2046 ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr); 2047 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2048 ierr = 
MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2049 ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr); 2050 } else { 2051 ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr); 2052 ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr); 2053 } 2054 2055 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2056 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2057 cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 2058 /* (re)allocate mmBuffer if not initialized or LDAs are different */ 2059 if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 2060 size_t mmBufferSize; 2061 if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;} 2062 if (!mmdata->matBDescr) { 2063 stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2064 mmdata->Blda = blda; 2065 } 2066 2067 if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;} 2068 if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 2069 stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2070 mmdata->Clda = clda; 2071 } 2072 2073 if (!mat->matDescr) { 2074 stat = cusparseCreateCsr(&mat->matDescr, 2075 csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, 2076 csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), 2077 csrmat->values->data().get(), 2078 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2079 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 2080 } 2081 stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one, 2082 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2083 mmdata->matCDescr,cusparse_scalartype, 2084 cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat); 2085 if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) { 2086 cudaError_t cerr; 2087 cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); 2088 cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr); 2089 mmdata->mmBufferSize = mmBufferSize; 2090 } 2091 mmdata->initialized = PETSC_TRUE; 2092 } else { 2093 /* to be safe, always update pointers of the mats */ 2094 stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat); 2095 stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat); 2096 stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat); 2097 } 2098 2099 /* do cusparseSpMM, which supports transpose on B */ 2100 stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one, 2101 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2102 mmdata->matCDescr,cusparse_scalartype, 2103 cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2104 #else 2105 PetscInt k; 2106 /* cusparseXcsrmm does not support transpose on B */ 2107 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2108 cublasHandle_t cublasv2handle; 2109 cublasStatus_t cerr; 2110 2111 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 2112 cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T, 2113 B->cmap->n,B->rmap->n, 2114 &PETSC_CUSPARSE_ONE ,barray,blda, 2115
&PETSC_CUSPARSE_ZERO,barray,blda, 2116 mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr); 2117 blda = B->cmap->n; 2118 k = B->cmap->n; 2119 } else { 2120 k = B->rmap->n; 2121 } 2122 2123 /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 2124 stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k, 2125 csrmat->num_entries,mat->alpha_one,mat->descr, 2126 csrmat->values->data().get(), 2127 csrmat->row_offsets->data().get(), 2128 csrmat->column_indices->data().get(), 2129 mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero, 2130 carray,clda);CHKERRCUSPARSE(stat); 2131 #endif 2132 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2133 ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr); 2134 ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr); 2135 if (product->type == MATPRODUCT_RARt) { 2136 ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2137 ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2138 } else if (product->type == MATPRODUCT_PtAP) { 2139 ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2140 ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2141 } else { 2142 ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr); 2143 } 2144 if (mmdata->cisdense) { 2145 ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr); 2146 } 2147 if (!biscuda) { 2148 ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 2149 } 2150 PetscFunctionReturn(0); 2151 } 2152 2153 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2154 { 2155 Mat_Product *product = C->product; 2156 Mat A,B; 2157 PetscInt m,n; 2158 PetscBool cisdense,flg; 2159 PetscErrorCode ierr; 2160 MatMatCusparse *mmdata; 2161 Mat_SeqAIJCUSPARSE *cusp; 2162 2163 PetscFunctionBegin; 2164 MatCheckProduct(C,1); 2165 if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty"); 2166 A = product->A; 2167 B = product->B; 2168 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2169 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2170 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2171 if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2172 switch (product->type) { 2173 case MATPRODUCT_AB: 2174 m = A->rmap->n; 2175 n = B->cmap->n; 2176 break; 2177 case MATPRODUCT_AtB: 2178 m = A->cmap->n; 2179 n = B->cmap->n; 2180 break; 2181 case MATPRODUCT_ABt: 2182 m = A->rmap->n; 2183 n = B->rmap->n; 2184 break; 2185 case MATPRODUCT_PtAP: 2186 m = B->cmap->n; 2187 n = B->cmap->n; 2188 break; 2189 case MATPRODUCT_RARt: 2190 m = B->rmap->n; 2191 n = B->rmap->n; 2192 break; 2193 default: 2194 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2195 } 2196 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2197 /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */ 2198 ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr); 2199 ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr); 2200 2201 /* product data */ 2202 ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2203 mmdata->cisdense = cisdense; 2204 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0) 2205 /* cusparseXcsrmm does not support transpose on B, so we 
allocate buffer to store B^T */ 2206 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2207 cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr); 2208 } 2209 #endif 2210 /* for these products we need intermediate storage */ 2211 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2212 ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr); 2213 ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr); 2214 if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */ 2215 ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr); 2216 } else { 2217 ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr); 2218 } 2219 } 2220 C->product->data = mmdata; 2221 C->product->destroy = MatDestroy_MatMatCusparse; 2222 2223 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 2224 PetscFunctionReturn(0); 2225 } 2226 2227 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2228 { 2229 Mat_Product *product = C->product; 2230 Mat A,B; 2231 Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2232 Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 2233 Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2234 CsrMatrix *Acsr,*Bcsr,*Ccsr; 2235 PetscBool flg; 2236 PetscErrorCode ierr; 2237 cusparseStatus_t stat; 2238 cudaError_t cerr; 2239 MatProductType ptype; 2240 MatMatCusparse *mmdata; 2241 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2242 cusparseSpMatDescr_t BmatSpDescr; 2243 #endif 2244 2245 PetscFunctionBegin; 2246 MatCheckProduct(C,1); 2247 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty"); 2248 ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2249 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name); 2250 mmdata = (MatMatCusparse*)C->product->data; 2251 A = product->A; 2252 B = product->B; 2253 if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */ 2254 mmdata->reusesym = PETSC_FALSE; 2255 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2256 if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2257 Cmat = Ccusp->mat; 2258 if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]); 2259 Ccsr = (CsrMatrix*)Cmat->mat; 2260 if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct"); 2261 goto finalize; 2262 } 2263 if (!c->nz) goto finalize; 2264 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2265 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2266 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2267 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name); 2268 if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2269 if (B->boundtocpu) 
SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2270 Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2271 Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2272 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2273 if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2274 if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2275 if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2276 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2277 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 2278 2279 ptype = product->type; 2280 if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 2281 if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 2282 switch (ptype) { 2283 case MATPRODUCT_AB: 2284 Amat = Acusp->mat; 2285 Bmat = Bcusp->mat; 2286 break; 2287 case MATPRODUCT_AtB: 2288 Amat = Acusp->matTranspose; 2289 Bmat = Bcusp->mat; 2290 break; 2291 case MATPRODUCT_ABt: 2292 Amat = Acusp->mat; 2293 Bmat = Bcusp->matTranspose; 2294 break; 2295 default: 2296 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2297 } 2298 Cmat = Ccusp->mat; 2299 if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2300 if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2301 if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]); 2302 Acsr = (CsrMatrix*)Amat->mat; 2303 Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */ 2304 Ccsr = (CsrMatrix*)Cmat->mat; 2305 if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct"); 2306 if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct"); 2307 if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct"); 2308 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2309 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2310 BmatSpDescr = mmdata->Bcsr ? 
mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */ 2311 stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2312 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2313 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2314 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2315 stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2316 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2317 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2318 #else 2319 stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2320 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2321 Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2322 Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2323 Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2324 #endif 2325 ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2326 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2327 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2328 C->offloadmask = PETSC_OFFLOAD_GPU; 2329 finalize: 2330 /* shorter version of MatAssemblyEnd_SeqAIJ */ 2331 ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 2332 ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 2333 ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr); 2334 c->reallocs = 0; 2335 C->info.mallocs += 0; 2336 C->info.nz_unneeded = 0; 2337 C->assembled = C->was_assembled = PETSC_TRUE; 2338 C->num_ass++; 2339 PetscFunctionReturn(0); 2340 } 2341 2342 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2343 { 2344 Mat_Product *product = C->product; 2345 Mat A,B; 2346 Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2347 Mat_SeqAIJ *a,*b,*c; 2348 Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2349 CsrMatrix *Acsr,*Bcsr,*Ccsr; 2350 PetscInt i,j,m,n,k; 2351 PetscBool flg; 2352 PetscErrorCode ierr; 2353 cusparseStatus_t stat; 2354 cudaError_t cerr; 2355 MatProductType ptype; 2356 MatMatCusparse *mmdata; 2357 PetscLogDouble flops; 2358 PetscBool biscompressed,ciscompressed; 2359 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2360 int64_t C_num_rows1, C_num_cols1, C_nnz1; 2361 size_t bufSize2; 2362 cusparseSpMatDescr_t BmatSpDescr; 2363 #else 2364 int cnz; 2365 #endif 2366 2367 PetscFunctionBegin; 2368 MatCheckProduct(C,1); 2369 if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty"); 2370 A = product->A; 2371 B = product->B; 2372 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2373 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2374 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2375 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name); 2376 a = (Mat_SeqAIJ*)A->data; 2377 b = (Mat_SeqAIJ*)B->data; 2378 Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2379 Bcusp = 
(Mat_SeqAIJCUSPARSE*)B->spptr; 2380 if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2381 if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2382 2383 /* product data */ 2384 ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2385 C->product->data = mmdata; 2386 C->product->destroy = MatDestroy_MatMatCusparse; 2387 2388 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2389 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 2390 ptype = product->type; 2391 if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 2392 if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 2393 biscompressed = PETSC_FALSE; 2394 ciscompressed = PETSC_FALSE; 2395 switch (ptype) { 2396 case MATPRODUCT_AB: 2397 m = A->rmap->n; 2398 n = B->cmap->n; 2399 k = A->cmap->n; 2400 Amat = Acusp->mat; 2401 Bmat = Bcusp->mat; 2402 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2403 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2404 break; 2405 case MATPRODUCT_AtB: 2406 m = A->cmap->n; 2407 n = B->cmap->n; 2408 k = A->rmap->n; 2409 ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr); 2410 Amat = Acusp->matTranspose; 2411 Bmat = Bcusp->mat; 2412 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2413 break; 2414 case MATPRODUCT_ABt: 2415 m = A->rmap->n; 2416 n = B->rmap->n; 2417 k = A->cmap->n; 2418 ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr); 2419 Amat = Acusp->mat; 2420 Bmat = Bcusp->matTranspose; 2421 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2422 break; 2423 default: 2424 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2425 } 2426 2427 /* create cusparse matrix */ 2428 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2429 ierr = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 2430 c = (Mat_SeqAIJ*)C->data; 2431 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2432 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 2433 Ccsr = new CsrMatrix; 2434 2435 c->compressedrow.use = ciscompressed; 2436 if (c->compressedrow.use) { /* if a is in compressed row, then c will be in compressed row format */ 2437 c->compressedrow.nrows = a->compressedrow.nrows; 2438 ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr); 2439 ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr); 2440 Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 2441 Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 2442 Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows); 2443 } else { 2444 c->compressedrow.nrows = 0; 2445 c->compressedrow.i = NULL; 2446 c->compressedrow.rindex = NULL; 2447 Ccusp->workVector = NULL; 2448 Cmat->cprowIndices = NULL; 2449 } 2450 Ccusp->nrows = ciscompressed ?
c->compressedrow.nrows : m; 2451 Ccusp->mat = Cmat; 2452 Ccusp->mat->mat = Ccsr; 2453 Ccsr->num_rows = Ccusp->nrows; 2454 Ccsr->num_cols = n; 2455 Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1); 2456 stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat); 2457 stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 2458 stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 2459 cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 2460 cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 2461 cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 2462 cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2463 cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2464 cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2465 if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */ 2466 thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0); 2467 c->nz = 0; 2468 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2469 Ccsr->values = new THRUSTARRAY(c->nz); 2470 goto finalizesym; 2471 } 2472 2473 if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2474 if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2475 Acsr = (CsrMatrix*)Amat->mat; 2476 if (!biscompressed) { 2477 Bcsr = (CsrMatrix*)Bmat->mat; 2478 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2479 BmatSpDescr = Bmat->matDescr; 2480 #endif 2481 } else { /* we need to use row offsets for the full matrix */ 2482 CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat; 2483 Bcsr = new CsrMatrix; 2484 Bcsr->num_rows = B->rmap->n; 2485 Bcsr->num_cols = cBcsr->num_cols; 2486 Bcsr->num_entries = cBcsr->num_entries; 2487 Bcsr->column_indices = cBcsr->column_indices; 2488 Bcsr->values = cBcsr->values; 2489 if (!Bcusp->rowoffsets_gpu) { 2490 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 2491 Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 2492 ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 2493 } 2494 Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 2495 mmdata->Bcsr = Bcsr; 2496 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2497 if (Bcsr->num_rows && Bcsr->num_cols) { 2498 stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, 2499 Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2500 Bcsr->values->data().get(), 2501 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2502 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2503 } 2504 BmatSpDescr = mmdata->matSpBDescr; 2505 #endif 2506 } 2507 if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct"); 2508 if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct"); 2509 /* precompute flops count */ 2510 if (ptype == MATPRODUCT_AB) { 2511 for (i=0, flops = 0; i<A->rmap->n; i++) { 2512 const PetscInt st = a->i[i]; 2513 const PetscInt en = a->i[i+1]; 2514 for (j=st; j<en; j++) { 2515 const PetscInt brow = 
a->j[j]; 2516 flops += 2.*(b->i[brow+1] - b->i[brow]); 2517 } 2518 } 2519 } else if (ptype == MATPRODUCT_AtB) { 2520 for (i=0, flops = 0; i<A->rmap->n; i++) { 2521 const PetscInt anzi = a->i[i+1] - a->i[i]; 2522 const PetscInt bnzi = b->i[i+1] - b->i[i]; 2523 flops += (2.*anzi)*bnzi; 2524 } 2525 } else { /* TODO */ 2526 flops = 0.; 2527 } 2528 2529 mmdata->flops = flops; 2530 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2531 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2532 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2533 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, 2534 NULL, NULL, NULL, 2535 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2536 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2537 stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2538 /* ask bufferSize bytes for external memory */ 2539 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2540 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2541 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2542 mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat); 2543 cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr); 2544 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2545 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2546 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2547 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2548 mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat); 2549 /* ask bufferSize again bytes for external memory */ 2550 stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2551 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2552 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2553 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat); 2554 /* The CUSPARSE documentation is not clear, nor the API 2555 We need both buffers to perform the operations properly! 2556 mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 2557 it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 2558 is stored in the descriptor! What a messy API... 
*/ 2559 cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr); 2560 /* compute the intermediate product of A * B */ 2561 stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2562 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2563 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2564 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2565 /* get matrix C non-zero entries C_nnz1 */ 2566 stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat); 2567 c->nz = (PetscInt) C_nnz1; 2568 ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr); 2569 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2570 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2571 Ccsr->values = new THRUSTARRAY(c->nz); 2572 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2573 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2574 Ccsr->values->data().get());CHKERRCUSPARSE(stat); 2575 stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2576 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2577 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2578 #else 2579 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 2580 stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2581 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2582 Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2583 Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2584 Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat); 2585 c->nz = cnz; 2586 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2587 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2588 Ccsr->values = new THRUSTARRAY(c->nz); 2589 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2590 2591 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2592 /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 2593 I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 2594 D is NULL, despite the fact that CUSPARSE documentation claims it is supported! 
*/ 2595 stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2596 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2597 Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2598 Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2599 Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2600 #endif 2601 ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2602 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2603 finalizesym: 2604 c->singlemalloc = PETSC_FALSE; 2605 c->free_a = PETSC_TRUE; 2606 c->free_ij = PETSC_TRUE; 2607 ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 2608 ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 2609 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 2610 PetscInt *d_i = c->i; 2611 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 2612 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 2613 ii = *Ccsr->row_offsets; 2614 jj = *Ccsr->column_indices; 2615 if (ciscompressed) d_i = c->compressedrow.i; 2616 cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2617 cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2618 } else { 2619 PetscInt *d_i = c->i; 2620 if (ciscompressed) d_i = c->compressedrow.i; 2621 cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2622 cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2623 } 2624 if (ciscompressed) { /* need to expand host row offsets */ 2625 PetscInt r = 0; 2626 c->i[0] = 0; 2627 for (k = 0; k < c->compressedrow.nrows; k++) { 2628 const PetscInt next = c->compressedrow.rindex[k]; 2629 const PetscInt old = c->compressedrow.i[k]; 2630 for (; r < next; r++) c->i[r+1] = old; 2631 } 2632 for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows]; 2633 } 2634 ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 2635 ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 2636 ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 2637 c->maxnz = c->nz; 2638 c->nonzerorowcnt = 0; 2639 c->rmax = 0; 2640 for (k = 0; k < m; k++) { 2641 const PetscInt nn = c->i[k+1] - c->i[k]; 2642 c->ilen[k] = c->imax[k] = nn; 2643 c->nonzerorowcnt += (PetscInt)!!nn; 2644 c->rmax = PetscMax(c->rmax,nn); 2645 } 2646 ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr); 2647 ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 2648 Ccsr->num_entries = c->nz; 2649 2650 C->nonzerostate++; 2651 ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr); 2652 ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr); 2653 Ccusp->nonzerostate = C->nonzerostate; 2654 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 2655 C->preallocated = PETSC_TRUE; 2656 C->assembled = PETSC_FALSE; 2657 C->was_assembled = PETSC_FALSE; 2658 if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */ 2659 mmdata->reusesym = PETSC_TRUE; 2660 C->offloadmask = 
PETSC_OFFLOAD_GPU; 2661 } 2662 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2663 PetscFunctionReturn(0); 2664 } 2665 2666 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 2667 2668 /* handles sparse or dense B */ 2669 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat) 2670 { 2671 Mat_Product *product = mat->product; 2672 PetscErrorCode ierr; 2673 PetscBool isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE; 2674 2675 PetscFunctionBegin; 2676 MatCheckProduct(mat,1); 2677 ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr); 2678 if (!product->A->boundtocpu && !product->B->boundtocpu) { 2679 ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr); 2680 } 2681 if (product->type == MATPRODUCT_ABC) { 2682 Ciscusp = PETSC_FALSE; 2683 if (!product->C->boundtocpu) { 2684 ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr); 2685 } 2686 } 2687 if (isdense) { 2688 switch (product->type) { 2689 case MATPRODUCT_AB: 2690 case MATPRODUCT_AtB: 2691 case MATPRODUCT_ABt: 2692 case MATPRODUCT_PtAP: 2693 case MATPRODUCT_RARt: 2694 if (product->A->boundtocpu) { 2695 ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr); 2696 } else { 2697 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 2698 } 2699 break; 2700 case MATPRODUCT_ABC: 2701 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2702 break; 2703 default: 2704 break; 2705 } 2706 } else if (Biscusp && Ciscusp) { 2707 switch (product->type) { 2708 case MATPRODUCT_AB: 2709 case MATPRODUCT_AtB: 2710 case MATPRODUCT_ABt: 2711 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2712 break; 2713 case MATPRODUCT_PtAP: 2714 case MATPRODUCT_RARt: 2715 case MATPRODUCT_ABC: 2716 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2717 break; 2718 default: 2719 break; 2720 } 2721 } else { /* fallback for AIJ */ 2722 ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 2723 } 2724 PetscFunctionReturn(0); 2725 } 2726 2727 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2728 { 2729 PetscErrorCode ierr; 2730 2731 PetscFunctionBegin; 2732 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2733 PetscFunctionReturn(0); 2734 } 2735 2736 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz) 2737 { 2738 PetscErrorCode ierr; 2739 2740 PetscFunctionBegin; 2741 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2742 PetscFunctionReturn(0); 2743 } 2744 2745 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2746 { 2747 PetscErrorCode ierr; 2748 2749 PetscFunctionBegin; 2750 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 2751 PetscFunctionReturn(0); 2752 } 2753 2754 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2755 { 2756 PetscErrorCode ierr; 2757 2758 PetscFunctionBegin; 2759 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 2760 PetscFunctionReturn(0); 2761 } 2762 2763 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2764 { 2765 PetscErrorCode ierr; 2766 2767 PetscFunctionBegin; 2768 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2769 
PetscFunctionReturn(0); 2770 } 2771 2772 __global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y) 2773 { 2774 int i = blockIdx.x*blockDim.x + threadIdx.x; 2775 if (i < n) y[idx[i]] += x[i]; 2776 } 2777 2778 /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 2779 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm) 2780 { 2781 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2782 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 2783 Mat_SeqAIJCUSPARSEMultStruct *matstruct; 2784 PetscScalar *xarray,*zarray,*dptr,*beta,*xptr; 2785 PetscErrorCode ierr; 2786 cusparseStatus_t stat; 2787 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2788 PetscBool compressed; 2789 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2790 PetscInt nx,ny; 2791 #endif 2792 2793 PetscFunctionBegin; 2794 if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported"); 2795 if (!a->nonzerorowcnt) { 2796 if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);} 2797 else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);} 2798 PetscFunctionReturn(0); 2799 } 2800 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 2801 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2802 if (!trans) { 2803 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 2804 if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 2805 } else { 2806 if (herm || !A->form_explicit_transpose) { 2807 opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 2808 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 2809 } else { 2810 if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);} 2811 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 2812 } 2813 } 2814 /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 2815 compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE; 2816 2817 try { 2818 ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 2819 if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */ 2820 else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */ 2821 2822 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2823 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 2824 /* z = A x + beta y. 2825 If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 2826 When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 2827 */ 2828 xptr = xarray; 2829 dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 2830 beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 2831 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2832 /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 2833 allocated to accommodate different uses. So we get the length info directly from mat. 
2834 */ 2835 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2836 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2837 nx = mat->num_cols; 2838 ny = mat->num_rows; 2839 } 2840 #endif 2841 } else { 2842 /* z = A^T x + beta y 2843 If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 2844 Note A^Tx is of full length, so we set beta to 1.0 if y exists. 2845 */ 2846 xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 2847 dptr = zarray; 2848 beta = yy ? matstruct->beta_one : matstruct->beta_zero; 2849 if (compressed) { /* Scatter x to work vector */ 2850 thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 2851 thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 2852 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 2853 VecCUDAEqualsReverse()); 2854 } 2855 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2856 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2857 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2858 nx = mat->num_rows; 2859 ny = mat->num_cols; 2860 } 2861 #endif 2862 } 2863 2864 /* csr_spmv does y = alpha op(A) x + beta y */ 2865 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2866 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2867 if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 2868 if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 2869 cudaError_t cerr; 2870 stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 2871 stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 2872 stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, 2873 matstruct->matDescr, 2874 matstruct->cuSpMV[opA].vecXDescr, beta, 2875 matstruct->cuSpMV[opA].vecYDescr, 2876 cusparse_scalartype, 2877 cusparsestruct->spmvAlg, 2878 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat); 2879 cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr); 2880 2881 matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 2882 } else { 2883 /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 2884 stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat); 2885 stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat); 2886 } 2887 2888 stat = cusparseSpMV(cusparsestruct->handle, opA, 2889 matstruct->alpha_one, 2890 matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTransposeForMult() */ 2891 matstruct->cuSpMV[opA].vecXDescr, 2892 beta, 2893 matstruct->cuSpMV[opA].vecYDescr, 2894 cusparse_scalartype, 2895 cusparsestruct->spmvAlg, 2896 matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat); 2897 #else 2898 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2899 stat = cusparse_csr_spmv(cusparsestruct->handle, opA, 2900 mat->num_rows, mat->num_cols, 2901 mat->num_entries, matstruct->alpha_one, matstruct->descr, 2902 mat->values->data().get(), mat->row_offsets->data().get(), 2903 
mat->column_indices->data().get(), xptr, beta, 2904 dptr);CHKERRCUSPARSE(stat); 2905 #endif 2906 } else { 2907 if (cusparsestruct->nrows) { 2908 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2909 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 2910 #else 2911 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 2912 stat = cusparse_hyb_spmv(cusparsestruct->handle, opA, 2913 matstruct->alpha_one, matstruct->descr, hybMat, 2914 xptr, beta, 2915 dptr);CHKERRCUSPARSE(stat); 2916 #endif 2917 } 2918 } 2919 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2920 2921 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 2922 if (yy) { /* MatMultAdd: zz = A*xx + yy */ 2923 if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 2924 ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */ 2925 } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */ 2926 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 2927 } 2928 } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 2929 ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr); 2930 } 2931 2932 /* ScatterAdd the result from work vector into the full vector when A is compressed */ 2933 if (compressed) { 2934 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2935 /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred) 2936 and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to 2937 prevent that. So I just add a ScatterAdd kernel. 2938 */ 2939 #if 0 2940 thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 2941 thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream), 2942 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 2943 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 2944 VecCUDAPlusEquals()); 2945 #else 2946 PetscInt n = matstruct->cprowIndices->size(); 2947 ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray); 2948 #endif 2949 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2950 } 2951 } else { 2952 if (yy && yy != zz) { 2953 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 2954 } 2955 } 2956 ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 2957 if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);} 2958 else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);} 2959 } catch(char *ex) { 2960 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 2961 } 2962 if (yy) { 2963 ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); 2964 } else { 2965 ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr); 2966 } 2967 PetscFunctionReturn(0); 2968 } 2969 2970 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2971 { 2972 PetscErrorCode ierr; 2973 2974 PetscFunctionBegin; 2975 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2976 PetscFunctionReturn(0); 2977 } 2978 2979 static 
PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 2980 { 2981 PetscErrorCode ierr; 2982 PetscSplitCSRDataStructure *d_mat = NULL; 2983 PetscFunctionBegin; 2984 if (A->factortype == MAT_FACTOR_NONE) { 2985 d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat; 2986 } 2987 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it? 2988 if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0); 2989 if (d_mat) { 2990 A->offloadmask = PETSC_OFFLOAD_GPU; 2991 } 2992 2993 PetscFunctionReturn(0); 2994 } 2995 2996 /* --------------------------------------------------------------------------------*/ 2997 /*@ 2998 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 2999 (the default sequential PETSc format). This matrix will ultimately be pushed down 3000 to NVIDIA GPUs and will use the CUSPARSE library for calculations. For good matrix 3001 assembly performance the user should preallocate the matrix storage by setting 3002 the parameter nz (or the array nnz). By setting these parameters accurately, 3003 performance during matrix assembly can be increased by more than a factor of 50. 3004 3005 Collective 3006 3007 Input Parameters: 3008 + comm - MPI communicator, set to PETSC_COMM_SELF 3009 . m - number of rows 3010 . n - number of columns 3011 . nz - number of nonzeros per row (same for all rows) 3012 - nnz - array containing the number of nonzeros in the various rows 3013 (possibly different for each row) or NULL 3014 3015 Output Parameter: 3016 . A - the matrix 3017 3018 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 3019 MatXXXXSetPreallocation() paradigm instead of this routine directly. 3020 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 3021 3022 Notes: 3023 If nnz is given then nz is ignored. 3024 3025 The AIJ format (also called the Yale sparse matrix format or 3026 compressed row storage) is fully compatible with standard Fortran 77 3027 storage. That is, the stored row and column indices can begin at 3028 either one (as in Fortran) or zero. See the users' manual for details. 3029 3030 Specify the preallocated storage with either nz or nnz (not both). 3031 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 3032 allocation. For large problems you MUST preallocate memory or you 3033 will get TERRIBLE performance; see the users' manual chapter on matrices. 3034 3035 By default, this format uses inodes (identical nodes) when possible, to 3036 improve numerical efficiency of matrix-vector products and solves. We 3037 search for consecutive rows with the same nonzero structure, thereby 3038 reusing matrix information to achieve increased efficiency.
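   Example usage, a minimal sketch that is not taken from the PETSc test suite (the 10x10 size, the
   nz=5 per-row preallocation, and the single inserted entry are illustrative values only):
.vb
   Mat            A;
   PetscInt       row = 0, col = 0;
   PetscScalar    val = 1.0;
   PetscErrorCode ierr;

   ierr = MatCreateSeqAIJCUSPARSE(PETSC_COMM_SELF,10,10,5,NULL,&A);CHKERRQ(ierr);
   ierr = MatSetValues(A,1,&row,1,&col,&val,INSERT_VALUES);CHKERRQ(ierr);
   ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
   ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
   ierr = MatDestroy(&A);CHKERRQ(ierr);
.ve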
3039 3040 Level: intermediate 3041 3042 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 3043 @*/ 3044 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 3045 { 3046 PetscErrorCode ierr; 3047 3048 PetscFunctionBegin; 3049 ierr = MatCreate(comm,A);CHKERRQ(ierr); 3050 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 3051 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3052 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 3053 PetscFunctionReturn(0); 3054 } 3055 3056 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 3057 { 3058 PetscErrorCode ierr; 3059 PetscSplitCSRDataStructure *d_mat = NULL; 3060 3061 PetscFunctionBegin; 3062 if (A->factortype == MAT_FACTOR_NONE) { 3063 d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat; 3064 ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL; 3065 ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr); 3066 } else { 3067 ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr); 3068 } 3069 if (d_mat) { 3070 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3071 cudaError_t err; 3072 PetscSplitCSRDataStructure h_mat; 3073 ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr); 3074 err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err); 3075 if (a->compressedrow.use) { 3076 err = cudaFree(h_mat.diag.i);CHKERRCUDA(err); 3077 } 3078 err = cudaFree(d_mat);CHKERRCUDA(err); 3079 } 3080 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr); 3081 ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr); 3082 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 3083 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 3084 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr); 3085 ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 3086 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 3087 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 3088 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 3089 PetscFunctionReturn(0); 3090 } 3091 3092 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*); 3093 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool); 3094 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B) 3095 { 3096 PetscErrorCode ierr; 3097 3098 PetscFunctionBegin; 3099 ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 3100 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr); 3101 PetscFunctionReturn(0); 3102 } 3103 3104 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str) 3105 { 3106 PetscErrorCode ierr; 3107 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 3108 Mat_SeqAIJCUSPARSE *cy; 3109 Mat_SeqAIJCUSPARSE *cx; 3110 PetscScalar *ay; 3111 const PetscScalar *ax; 3112 CsrMatrix *csry,*csrx; 3113 3114 PetscFunctionBegin; 3115 cy = 
(Mat_SeqAIJCUSPARSE*)Y->spptr; 3116 cx = (Mat_SeqAIJCUSPARSE*)X->spptr; 3117 if (X->ops->axpy != Y->ops->axpy) { 3118 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr); 3119 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 3120 PetscFunctionReturn(0); 3121 } 3122 /* if we are here, it means both matrices are bound to GPU */ 3123 ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr); 3124 ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr); 3125 if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported"); 3126 if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported"); 3127 csry = (CsrMatrix*)cy->mat->mat; 3128 csrx = (CsrMatrix*)cx->mat->mat; 3129 /* see if we can turn this into a cublas axpy */ 3130 if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) { 3131 bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin()); 3132 if (eq) { 3133 eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin()); 3134 } 3135 if (eq) str = SAME_NONZERO_PATTERN; 3136 } 3137 /* spgeam is buggy with one column */ 3138 if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN; 3139 3140 if (str == SUBSET_NONZERO_PATTERN) { 3141 cusparseStatus_t stat; 3142 PetscScalar b = 1.0; 3143 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3144 size_t bufferSize; 3145 void *buffer; 3146 cudaError_t cerr; 3147 #endif 3148 3149 ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr); 3150 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3151 stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 3152 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3153 stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n, 3154 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3155 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3156 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat); 3157 cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr); 3158 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3159 stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3160 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3161 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3162 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat); 3163 ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr); 3164 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3165 cerr = cudaFree(buffer);CHKERRCUDA(cerr); 3166 #else 3167 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3168 stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3169 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3170 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3171 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat); 3172 ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr); 3173 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3174 #endif 3175 stat = 
cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 3176 ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr); 3177 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3178 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3179 } else if (str == SAME_NONZERO_PATTERN) { 3180 cublasHandle_t cublasv2handle; 3181 cublasStatus_t berr; 3182 PetscBLASInt one = 1, bnz = 1; 3183 3184 ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr); 3185 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3186 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 3187 ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr); 3188 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3189 berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr); 3190 ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr); 3191 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3192 ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr); 3193 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3194 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3195 } else { 3196 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr); 3197 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 3198 } 3199 PetscFunctionReturn(0); 3200 } 3201 3202 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a) 3203 { 3204 PetscErrorCode ierr; 3205 Mat_SeqAIJ *y = (Mat_SeqAIJ*)Y->data; 3206 PetscScalar *ay; 3207 cublasHandle_t cublasv2handle; 3208 cublasStatus_t berr; 3209 PetscBLASInt one = 1, bnz = 1; 3210 3211 PetscFunctionBegin; 3212 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3213 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 3214 ierr = PetscBLASIntCast(y->nz,&bnz);CHKERRQ(ierr); 3215 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3216 berr = cublasXscal(cublasv2handle,bnz,&a,ay,one);CHKERRCUBLAS(berr); 3217 ierr = PetscLogGpuFlops(bnz);CHKERRQ(ierr); 3218 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3219 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3220 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3221 PetscFunctionReturn(0); 3222 } 3223 3224 static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) 3225 { 3226 PetscErrorCode ierr; 3227 PetscBool both = PETSC_FALSE; 3228 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3229 3230 PetscFunctionBegin; 3231 if (A->factortype == MAT_FACTOR_NONE) { 3232 Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr; 3233 if (spptr->mat) { 3234 CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat; 3235 if (matrix->values) { 3236 both = PETSC_TRUE; 3237 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3238 } 3239 } 3240 if (spptr->matTranspose) { 3241 CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat; 3242 if (matrix->values) { 3243 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3244 } 3245 } 3246 } 3247 //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 3248 ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr); 3249 ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 3250 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 3251 else A->offloadmask = PETSC_OFFLOAD_CPU; 3252 3253 PetscFunctionReturn(0); 3254 } 3255 3256 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg) 3257 { 3258 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3259 PetscErrorCode ierr; 3260 3261 PetscFunctionBegin; 3262 if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0); 3263 if (flg) { 3264 ierr = 
MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 3265 3266 A->ops->scale = MatScale_SeqAIJ; 3267 A->ops->axpy = MatAXPY_SeqAIJ; 3268 A->ops->zeroentries = MatZeroEntries_SeqAIJ; 3269 A->ops->mult = MatMult_SeqAIJ; 3270 A->ops->multadd = MatMultAdd_SeqAIJ; 3271 A->ops->multtranspose = MatMultTranspose_SeqAIJ; 3272 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 3273 A->ops->multhermitiantranspose = NULL; 3274 A->ops->multhermitiantransposeadd = NULL; 3275 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ; 3276 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr); 3277 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 3278 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 3279 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 3280 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 3281 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr); 3282 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr); 3283 } else { 3284 A->ops->scale = MatScale_SeqAIJCUSPARSE; 3285 A->ops->axpy = MatAXPY_SeqAIJCUSPARSE; 3286 A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE; 3287 A->ops->mult = MatMult_SeqAIJCUSPARSE; 3288 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 3289 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 3290 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 3291 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 3292 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 3293 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE; 3294 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr); 3295 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3296 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3297 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3298 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3299 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr); 3300 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3301 } 3302 A->boundtocpu = flg; 3303 a->inode.use = flg; 3304 PetscFunctionReturn(0); 3305 } 3306 3307 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 3308 { 3309 PetscErrorCode ierr; 3310 cusparseStatus_t stat; 3311 Mat B; 3312 3313 PetscFunctionBegin; 3314 ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */ 3315 if (reuse == MAT_INITIAL_MATRIX) { 3316 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 3317 } else if (reuse == MAT_REUSE_MATRIX) { 
3318 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 3319 } 3320 B = *newmat; 3321 3322 ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 3323 ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr); 3324 3325 if (reuse != MAT_REUSE_MATRIX && !B->spptr) { 3326 if (B->factortype == MAT_FACTOR_NONE) { 3327 Mat_SeqAIJCUSPARSE *spptr; 3328 ierr = PetscNew(&spptr);CHKERRQ(ierr); 3329 stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3330 stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat); 3331 spptr->format = MAT_CUSPARSE_CSR; 3332 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3333 spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 3334 spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 3335 spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 3336 #endif 3337 B->spptr = spptr; 3338 } else { 3339 Mat_SeqAIJCUSPARSETriFactors *spptr; 3340 3341 ierr = PetscNew(&spptr);CHKERRQ(ierr); 3342 stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3343 stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat); 3344 B->spptr = spptr; 3345 } 3346 B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 3347 } 3348 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 3349 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 3350 B->ops->setoption = MatSetOption_SeqAIJCUSPARSE; 3351 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 3352 B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 3353 B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 3354 3355 ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr); 3356 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3357 ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 3358 PetscFunctionReturn(0); 3359 } 3360 3361 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 3362 { 3363 PetscErrorCode ierr; 3364 3365 PetscFunctionBegin; 3366 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 3367 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 3368 PetscFunctionReturn(0); 3369 } 3370 3371 /*MC 3372 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 3373 3374 A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either 3375 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 3376 All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library. 3377 3378 Options Database Keys: 3379 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 3380 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3381 - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
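   Example usage, a minimal sketch (A is assumed to be a Mat already created by the surrounding
   application code; the explicit MatCUSPARSESetFormat() call and the CSR choice shown here are only
   needed when the default format is not wanted):
.vb
   ierr = MatSetType(A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
   ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,MAT_CUSPARSE_CSR);CHKERRQ(ierr);
   ierr = MatSetFromOptions(A);CHKERRQ(ierr); /* lets -mat_cusparse_storage_format override the choice */
.ve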
3382 3383 Level: beginner 3384 3385 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 3386 M*/ 3387 3388 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*); 3389 3390 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 3391 { 3392 PetscErrorCode ierr; 3393 3394 PetscFunctionBegin; 3395 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr); 3396 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3397 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3398 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3399 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3400 3401 PetscFunctionReturn(0); 3402 } 3403 3404 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 3405 { 3406 PetscErrorCode ierr; 3407 cusparseStatus_t stat; 3408 3409 PetscFunctionBegin; 3410 if (*cusparsestruct) { 3411 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr); 3412 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr); 3413 delete (*cusparsestruct)->workVector; 3414 delete (*cusparsestruct)->rowoffsets_gpu; 3415 delete (*cusparsestruct)->cooPerm; 3416 delete (*cusparsestruct)->cooPerm_a; 3417 delete (*cusparsestruct)->csr2csc_i; 3418 if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);} 3419 ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr); 3420 } 3421 PetscFunctionReturn(0); 3422 } 3423 3424 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 3425 { 3426 PetscFunctionBegin; 3427 if (*mat) { 3428 delete (*mat)->values; 3429 delete (*mat)->column_indices; 3430 delete (*mat)->row_offsets; 3431 delete *mat; 3432 *mat = 0; 3433 } 3434 PetscFunctionReturn(0); 3435 } 3436 3437 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 3438 { 3439 cusparseStatus_t stat; 3440 PetscErrorCode ierr; 3441 3442 PetscFunctionBegin; 3443 if (*trifactor) { 3444 if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); } 3445 if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); } 3446 ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 3447 if ((*trifactor)->solveBuffer) {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);} 3448 if ((*trifactor)->AA_h) {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);} 3449 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3450 if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);} 3451 #endif 3452 ierr = PetscFree(*trifactor);CHKERRQ(ierr); 3453 } 3454 PetscFunctionReturn(0); 3455 } 3456 3457 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 3458 { 3459 CsrMatrix *mat; 3460 cusparseStatus_t stat; 3461 
cudaError_t err; 3462 3463 PetscFunctionBegin; 3464 if (*matstruct) { 3465 if ((*matstruct)->mat) { 3466 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 3467 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3468 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3469 #else 3470 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 3471 stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat); 3472 #endif 3473 } else { 3474 mat = (CsrMatrix*)(*matstruct)->mat; 3475 CsrMatrix_Destroy(&mat); 3476 } 3477 } 3478 if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); } 3479 delete (*matstruct)->cprowIndices; 3480 if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); } 3481 if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); } 3482 if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); } 3483 3484 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3485 Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 3486 if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);} 3487 for (int i=0; i<3; i++) { 3488 if (mdata->cuSpMV[i].initialized) { 3489 err = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err); 3490 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat); 3491 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat); 3492 } 3493 } 3494 #endif 3495 delete *matstruct; 3496 *matstruct = NULL; 3497 } 3498 PetscFunctionReturn(0); 3499 } 3500 3501 PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors) 3502 { 3503 PetscErrorCode ierr; 3504 3505 PetscFunctionBegin; 3506 if (*trifactors) { 3507 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr); 3508 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr); 3509 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr); 3510 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr); 3511 delete (*trifactors)->rpermIndices; 3512 delete (*trifactors)->cpermIndices; 3513 delete (*trifactors)->workVector; 3514 (*trifactors)->rpermIndices = NULL; 3515 (*trifactors)->cpermIndices = NULL; 3516 (*trifactors)->workVector = NULL; 3517 if ((*trifactors)->a_band_d) {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);} 3518 if ((*trifactors)->i_band_d) {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);} 3519 (*trifactors)->init_dev_prop = PETSC_FALSE; 3520 } 3521 PetscFunctionReturn(0); 3522 } 3523 3524 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 3525 { 3526 PetscErrorCode ierr; 3527 cusparseHandle_t handle; 3528 cusparseStatus_t stat; 3529 3530 PetscFunctionBegin; 3531 if (*trifactors) { 3532 ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr); 3533 if (handle = (*trifactors)->handle) { 3534 stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 3535 } 3536 ierr = PetscFree(*trifactors);CHKERRQ(ierr); 3537 } 3538 PetscFunctionReturn(0); 3539 } 3540 3541 struct IJCompare 3542 { 3543 __host__ __device__ 3544 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3545 { 3546 if (t1.get<0>() < t2.get<0>()) return true; 3547 if (t1.get<0>() == 
t2.get<0>()) return t1.get<1>() < t2.get<1>(); 3548 return false; 3549 } 3550 }; 3551 3552 struct IJEqual 3553 { 3554 __host__ __device__ 3555 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3556 { 3557 if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false; 3558 return true; 3559 } 3560 }; 3561 3562 struct IJDiff 3563 { 3564 __host__ __device__ 3565 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 3566 { 3567 return t1 == t2 ? 0 : 1; 3568 } 3569 }; 3570 3571 struct IJSum 3572 { 3573 __host__ __device__ 3574 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 3575 { 3576 return t1||t2; 3577 } 3578 }; 3579 3580 #include <thrust/iterator/discard_iterator.h> 3581 PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 3582 { 3583 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3584 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3585 THRUSTARRAY *cooPerm_v = NULL; 3586 thrust::device_ptr<const PetscScalar> d_v; 3587 CsrMatrix *matrix; 3588 PetscErrorCode ierr; 3589 PetscInt n; 3590 3591 PetscFunctionBegin; 3592 if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct"); 3593 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix"); 3594 if (!cusp->cooPerm) { 3595 ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3596 ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3597 PetscFunctionReturn(0); 3598 } 3599 matrix = (CsrMatrix*)cusp->mat->mat; 3600 if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3601 if (!v) { 3602 if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3603 goto finalize; 3604 } 3605 n = cusp->cooPerm->size(); 3606 if (isCudaMem(v)) { 3607 d_v = thrust::device_pointer_cast(v); 3608 } else { 3609 cooPerm_v = new THRUSTARRAY(n); 3610 cooPerm_v->assign(v,v+n); 3611 d_v = cooPerm_v->data(); 3612 ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); 3613 } 3614 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3615 if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */ 3616 if (cusp->cooPerm_a) { 3617 THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size()); 3618 auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3619 thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3620 thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>()); 3621 delete cooPerm_w; 3622 } else { 3623 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 3624 matrix->values->begin())); 3625 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 3626 matrix->values->end())); 3627 thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); 3628 } 3629 } else { 3630 if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */ 3631 auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3632 thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3633 } else { 3634 
auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 3635 matrix->values->begin())); 3636 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 3637 matrix->values->end())); 3638 thrust::for_each(zibit,zieit,VecCUDAEquals()); 3639 } 3640 } 3641 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3642 finalize: 3643 delete cooPerm_v; 3644 A->offloadmask = PETSC_OFFLOAD_GPU; 3645 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 3646 /* shorter version of MatAssemblyEnd_SeqAIJ */ 3647 ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr); 3648 ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 3649 ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr); 3650 a->reallocs = 0; 3651 A->info.mallocs += 0; 3652 A->info.nz_unneeded = 0; 3653 A->assembled = A->was_assembled = PETSC_TRUE; 3654 A->num_ass++; 3655 PetscFunctionReturn(0); 3656 } 3657 3658 PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy) 3659 { 3660 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3661 PetscErrorCode ierr; 3662 3663 PetscFunctionBegin; 3664 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3665 if (!cusp) PetscFunctionReturn(0); 3666 if (destroy) { 3667 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr); 3668 delete cusp->csr2csc_i; 3669 cusp->csr2csc_i = NULL; 3670 } 3671 A->transupdated = PETSC_FALSE; 3672 PetscFunctionReturn(0); 3673 } 3674 3675 #include <thrust/binary_search.h> 3676 PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[]) 3677 { 3678 PetscErrorCode ierr; 3679 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3680 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3681 PetscInt cooPerm_n, nzr = 0; 3682 cudaError_t cerr; 3683 3684 PetscFunctionBegin; 3685 ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr); 3686 ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr); 3687 cooPerm_n = cusp->cooPerm ? 
cusp->cooPerm->size() : 0; 3688 if (n != cooPerm_n) { 3689 delete cusp->cooPerm; 3690 delete cusp->cooPerm_a; 3691 cusp->cooPerm = NULL; 3692 cusp->cooPerm_a = NULL; 3693 } 3694 if (n) { 3695 THRUSTINTARRAY d_i(n); 3696 THRUSTINTARRAY d_j(n); 3697 THRUSTINTARRAY ii(A->rmap->n); 3698 3699 if (!cusp->cooPerm) { cusp->cooPerm = new THRUSTINTARRAY(n); } 3700 if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); } 3701 3702 ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 3703 d_i.assign(coo_i,coo_i+n); 3704 d_j.assign(coo_j,coo_j+n); 3705 auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin())); 3706 auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end())); 3707 3708 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3709 thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0); 3710 thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); 3711 *cusp->cooPerm_a = d_i; 3712 THRUSTINTARRAY w = d_j; 3713 3714 auto nekey = thrust::unique(fkey, ekey, IJEqual()); 3715 if (nekey == ekey) { /* all entries are unique */ 3716 delete cusp->cooPerm_a; 3717 cusp->cooPerm_a = NULL; 3718 } else { /* I couldn't come up with a more elegant algorithm */ 3719 adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); 3720 adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff()); 3721 (*cusp->cooPerm_a)[0] = 0; 3722 w[0] = 0; 3723 thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); 3724 thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); 3725 } 3726 thrust::counting_iterator<PetscInt> search_begin(0); 3727 thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), 3728 search_begin, search_begin + A->rmap->n, 3729 ii.begin()); 3730 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3731 3732 ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr); 3733 a->singlemalloc = PETSC_FALSE; 3734 a->free_a = PETSC_TRUE; 3735 a->free_ij = PETSC_TRUE; 3736 ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr); 3737 a->i[0] = 0; 3738 cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 3739 a->nz = a->maxnz = a->i[A->rmap->n]; 3740 a->rmax = 0; 3741 ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr); 3742 ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr); 3743 cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 3744 if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); } 3745 if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); } 3746 for (PetscInt i = 0; i < A->rmap->n; i++) { 3747 const PetscInt nnzr = a->i[i+1] - a->i[i]; 3748 nzr += (PetscInt)!!(nnzr); 3749 a->ilen[i] = a->imax[i] = nnzr; 3750 a->rmax = PetscMax(a->rmax,nnzr); 3751 } 3752 a->nonzerorowcnt = nzr; 3753 A->preallocated = PETSC_TRUE; 3754 ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr); 3755 ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr); 3756 } else { 3757 ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr); 3758 } 3759 ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr); 3760 3761 /* We want to allocate the CUSPARSE struct for matvec now. 
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  PetscFunctionReturn(0);
}

PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  *a = NULL;
  PetscFunctionReturn(0);
}

PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL;
  PetscFunctionReturn(0);
}

PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL;
  PetscFunctionReturn(0);
}

struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    if (t1.get<0>() < t2.get<0>()) return true;
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
    return false;
  }
};

struct Shift
{
  int _shift;

  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &c)
  {
    return c + _shift;
  }
};

/* merges two SeqAIJCUSPARSE matrices side by side, C = [A B] ([A';B']' in MATLAB notation) */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     Annz,Bnnz;
  cusparseStatus_t             stat;
  PetscInt                     i,m,n,zero = 0;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",A->rmap->n,B->rmap->n);
  if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
  if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    m     = A->rmap->n;
    n     = A->cmap->n + B->cmap->n;
    ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
    ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
    ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    c     = (Mat_SeqAIJ*)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
    Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr  = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows    = m;
    Ccusp->mat      = Cmat;
    Ccusp->mat->mat = Ccsr;
    Ccsr->num_rows  = m;
    Ccsr->num_cols  = n;
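    /* create the cuSPARSE descriptor for C and the device-side scalar constants (one and zero)
       that the cuSPARSE mat-vec kernels expect */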
    stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr);
    if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr = (CsrMatrix*)Acusp->mat->mat;
    Bcsr = (CsrMatrix*)Bcusp->mat->mat;
    Annz = (PetscInt)Acsr->column_indices->size();
    Bnnz = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    Ccsr->num_entries = c->nz;
    Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      auto Acoo = new THRUSTINTARRAY32(Annz);
      auto Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff,*Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
          ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
          ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      stat = cusparseXcsr2coo(Acusp->handle,
                              Aroff->data().get(),
                              Annz,
                              m,
                              Acoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle,
                              Broff->data().get(),
                              Bnnz,
                              m,
                              Bcoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
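      /* shift the column indices of B by the number of columns of A so that the merged
         entries use the column numbering of C = [A B] */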
#if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
      auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
      auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
      auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
      auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
      auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
      auto p1 = Ccusp->cooPerm->begin();
      auto p2 = Ccusp->cooPerm->begin();
      thrust::advance(p2,Annz);
      PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
      thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
#endif
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      auto pred = thrust::identity<int>();
      PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
      PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
#endif
      stat = cusparseXcoo2csr(Ccusp->handle,
                              Ccoo->data().get(),
                              c->nz,
                              m,
                              Ccsr->row_offsets->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
                               Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
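      /* the transpose of C = [A B] is [A';B'] stacked by rows, so when the explicit transposes of
         A and B are available the CSR data of C' can be formed by concatenating them (with the row
         offsets of B' shifted by the number of nonzeros of A) */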
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix *CcsrT = new CsrMatrix;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated = PETSC_TRUE;
        Ccusp->rowoffsets_gpu = NULL;
        CmatT->cprowIndices = NULL;
        CmatT->mat = CcsrT;
        CcsrT->num_rows = n;
        CcsrT->num_cols = m;
        CcsrT->num_entries = c->nz;

        CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values = new THRUSTARRAY(c->nz);

        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
          thrust::advance(rT,-1);
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
          thrust::copy(titb,tite,rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

        stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
                                 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
                                 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

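    /* mirror the merged CSR structure on the host so that the Mat_SeqAIJ (CPU) part of C is consistent */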
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii   = *Ccsr->row_offsets;
      jj   = *Ccsr->column_indices;
      cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    } else {
      cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
    c->maxnz = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax = 0;
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i+1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax,nn);
    }
    ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
    (*C)->nonzerostate++;
    ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
    ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",(*C)->rmap->n,B->rmap->n);
    c = (Mat_SeqAIJ*)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
      if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
      if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
      if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
      if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix*)Acusp->mat->mat;
      Bcsr = (CsrMatrix*)Bcusp->mat->mat;
      Ccsr = (CsrMatrix*)Ccusp->mat->mat;
      if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
      if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
      if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
      if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
      if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid,Acsr->num_entries);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      thrust::for_each(zibait,zieait,VecCUDAEquals());
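      /* scatter the values of B into the remaining positions of the permutation */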
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
      thrust::for_each(zibbit,ziebit,VecCUDAEquals());
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr);
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    }
  }
  ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;
  const PetscScalar *av;
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    THRUSTARRAY *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w  = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) {
      cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    delete w;
  } else {
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); }
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
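
/* Usage sketch (not part of the original source): merging two assembled MATSEQAIJCUSPARSE
   matrices with MatSeqAIJCUSPARSEMergeMats() above; A and B must have the same number of rows
   and the result is C = [A B].

     Mat C;
     MatSeqAIJCUSPARSEMergeMats(A,B,MAT_INITIAL_MATRIX,&C);  // builds the structure of C and the value permutation
     ... update the numerical values of A and/or B ...
     MatSeqAIJCUSPARSEMergeMats(A,B,MAT_REUSE_MATRIX,&C);    // only rescatters the values into C
     MatDestroy(&C);
*/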