/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <thrust/async/for_each.h>

const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
  } cusparseCsr2CscAlg_t;
*/
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG","CSRMV_ALG1","CSRMV_ALG2","cusparseSpMVAlg_t","CUSPARSE_",0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif

static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);

PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);

PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          flg;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg || !cusparsestruct) PetscFunctionReturn(0);
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for sequential matrices
  on a single GPU, of type seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
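
/* A minimal usage sketch (not part of this file's API): the solver is normally
   selected through PCFactorSetMatSolverType() or its options-database key, e.g.

     ./app -mat_type aijcusparse -pc_type ilu -pc_factor_mat_solver_type cusparse

   where "./app" stands in for any PETSc program. */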
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr);
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr);
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  (*B)->canuseordering = PETSC_TRUE;
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.

   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
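
/* A minimal sketch, assuming A has already been created as MATSEQAIJCUSPARSE;
   this is equivalent to the -mat_cusparse_mult_storage_format option registered
   in MatSetFromOptions_SeqAIJCUSPARSE() below:

     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,MAT_CUSPARSE_ELL);CHKERRQ(ierr);
*/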
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_FORM_EXPLICIT_TRANSPOSE:
    /* need to destroy the transpose matrix if present to prevent logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);}
    A->form_explicit_transpose = flg;
    break;
  default:
    ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
    break;
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);

static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If the user used this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
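
/* A sketch of typical command-line usage of the options registered above;
   "./app" stands in for any PETSc program, and value strings are matched
   (case-insensitively, as with other PetscOptionsEnum() options) against the
   enum arrays at the top of this file:

     ./app -mat_type seqaijcusparse -mat_cusparse_spmv_alg csrmv_alg2    (CUDA >= 11)
     ./app -mat_type seqaijcusparse -mat_cusparse_storage_format ell     (pre-CUDA-11 only; ELL/HYB were dropped in CUDA-11)
*/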
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
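      /* (Count sketch, inferred from the layout used below: row 0 of L holds only
         the unit diagonal, and each row i>0 contributes its ai[i+1]-ai[i]
         strictly-lower entries plus a unit diagonal, so
         nzLower = 1 + (ai[n]-ai[1]) + (n-1) = n + ai[n] - ai[1].) */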
      nzLower = n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;

        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix */
        AiLo[0]   = (PetscInt) 0;
        AiLo[n]   = nzLower;
        AjLo[0]   = (PetscInt) 0;
        AALo[0]   = (MatScalar) 1.0;
        v         = aa;
        vi        = aj;
        offset    = 1;
        rowOffset = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif
        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                                 );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h = AALo;
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v      = aa;
        vi     = aj;
        offset = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
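      /* (Layout sketch, inferred from the loop below: the factored SeqAIJ matrix
         stores U backwards, with adiag[] marking the diagonal positions; row i of
         U occupies aa[adiag[i+1]+1 .. adiag[i]] with the pivot at aa[adiag[i]],
         hence adiag[0]-adiag[n] entries in total.) */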
      nzUpper = adiag[0]-adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;
          vi = aj + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1./v[nz];
          AiUp[i]      = AiUp[i+1] - (nz+1);

          ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = n;
        upTriFactor->csrMat->num_cols    = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif
        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                                 );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
        upTriFactor->AA_h = AAUp;
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      } else {
        if (!upTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v = aa + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1./v[nz];
          ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
        ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cusparseTriFactors->nnz=a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
    ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* upper triangular indices */
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
    ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = A->rmap->n;
        upTriFactor->csrMat->num_cols    = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                                 );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
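
        /* (Note, inferred from the descriptor settings below: ICC keeps only the
           U factor on the GPU; the lower triangular solve is carried out as a
           transpose solve with this upper-triangular data, which is why
           loTriFactor is created with CUSPARSE_FILL_MODE_UPPER and
           CUSPARSE_OPERATION_TRANSPOSE.) */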
        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = A->rmap->n;
        loTriFactor->csrMat->num_cols    = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                                 );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else {
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing upTriFactor");
        if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing loTriFactor");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip,*rip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(loTriFactor->descr);
  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                       loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                       loTriFactor->csrMat->values->data().get(),
                                       loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(),
                                       loTriFactorT->csrMat->values->data().get(),
                                       loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
#else
                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
#endif
                          );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                           loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
#endif
                           );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                       upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                       upTriFactor->csrMat->values->data().get(),
                                       upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(),
                                       upTriFactorT->csrMat->values->data().get(),
                                       upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
#else
                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
#endif
                          );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                           upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
#endif
                           );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
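
/* Unary functor for Thrust: maps a PetscScalar whose real part encodes an index
   back to a PetscInt (apparently used with the csr2csc_i bookkeeping below). */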
struct PetscScalarToPetscInt
{
  __host__ __device__
  PetscInt operator()(PetscScalar s)
  {
    return (PetscInt)PetscRealPart(s);
  }
};

static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (!A->form_explicit_transpose || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0);
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(0);
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
  }
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

    /* set alpha and beta */
    err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      CsrMatrix *matrixT      = new CsrMatrix;
      matstructT->mat         = matrixT;
      matrixT->num_rows       = A->cmap->n;
      matrixT->num_cols       = A->rmap->n;
      matrixT->num_entries    = a->nz;
      matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows+1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values         = new THRUSTARRAY(a->nz);

      if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&matstructT->matDescr,
                               matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                               matrixT->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
(cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 1253 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1254 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1255 #else 1256 CsrMatrix *temp = new CsrMatrix; 1257 CsrMatrix *tempT = new CsrMatrix; 1258 /* First convert HYB to CSR */ 1259 temp->num_rows = A->rmap->n; 1260 temp->num_cols = A->cmap->n; 1261 temp->num_entries = a->nz; 1262 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1263 temp->column_indices = new THRUSTINTARRAY32(a->nz); 1264 temp->values = new THRUSTARRAY(a->nz); 1265 1266 stat = cusparse_hyb2csr(cusparsestruct->handle, 1267 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 1268 temp->values->data().get(), 1269 temp->row_offsets->data().get(), 1270 temp->column_indices->data().get());CHKERRCUSPARSE(stat); 1271 1272 /* Next, convert CSR to CSC (i.e. the matrix transpose) */ 1273 tempT->num_rows = A->rmap->n; 1274 tempT->num_cols = A->cmap->n; 1275 tempT->num_entries = a->nz; 1276 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1277 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1278 tempT->values = new THRUSTARRAY(a->nz); 1279 1280 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1281 temp->num_cols, temp->num_entries, 1282 temp->values->data().get(), 1283 temp->row_offsets->data().get(), 1284 temp->column_indices->data().get(), 1285 tempT->values->data().get(), 1286 tempT->column_indices->data().get(), 1287 tempT->row_offsets->data().get(), 1288 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1289 1290 /* Last, convert CSC to HYB */ 1291 cusparseHybMat_t hybMat; 1292 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1293 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 
1294 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1295 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1296 matstructT->descr, tempT->values->data().get(), 1297 tempT->row_offsets->data().get(), 1298 tempT->column_indices->data().get(), 1299 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1300 1301 /* assign the pointer */ 1302 matstructT->mat = hybMat; 1303 A->transupdated = PETSC_TRUE; 1304 /* delete temporaries */ 1305 if (tempT) { 1306 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1307 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1308 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1309 delete (CsrMatrix*) tempT; 1310 } 1311 if (temp) { 1312 if (temp->values) delete (THRUSTARRAY*) temp->values; 1313 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1314 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1315 delete (CsrMatrix*) temp; 1316 } 1317 #endif 1318 } 1319 } 1320 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */ 1321 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1322 CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat; 1323 if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix"); 1324 if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows"); 1325 if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols"); 1326 if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values"); 1327 if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT"); 1328 if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows"); 1329 if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols"); 1330 if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values"); 1331 if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */ 1332 cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1333 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 1334 ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 1335 } 1336 if (!cusparsestruct->csr2csc_i) { 1337 THRUSTARRAY csr2csc_a(matrix->num_entries); 1338 PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0)); 1339 1340 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1341 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1342 void *csr2cscBuffer; 1343 size_t csr2cscBufferSize; 1344 stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, 1345 A->cmap->n, matrix->num_entries, 1346 matrix->values->data().get(), 1347 cusparsestruct->rowoffsets_gpu->data().get(), 1348 matrix->column_indices->data().get(), 1349 matrixT->values->data().get(), 1350 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1351 CUSPARSE_ACTION_NUMERIC,indexBase, 1352 cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat); 1353 err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err); 1354 #endif 1355 1356 if (matrix->num_entries) { 1357 /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in 1358 mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK. 
1359 I checked every parameters and they were just fine. I have no clue why cusparse complains. 1360 1361 Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[] 1362 should be filled with indexBase. So I just take a shortcut here. 1363 */ 1364 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1365 A->cmap->n,matrix->num_entries, 1366 csr2csc_a.data().get(), 1367 cusparsestruct->rowoffsets_gpu->data().get(), 1368 matrix->column_indices->data().get(), 1369 matrixT->values->data().get(), 1370 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1371 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1372 CUSPARSE_ACTION_NUMERIC,indexBase, 1373 cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat); 1374 #else 1375 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1376 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1377 #endif 1378 } else { 1379 matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 1380 } 1381 1382 cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); 1383 PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt())); 1384 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1385 err = cudaFree(csr2cscBuffer);CHKERRCUDA(err); 1386 #endif 1387 } 1388 PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), 1389 thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), 1390 matrixT->values->begin())); 1391 } 1392 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1393 ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1394 /* the compressed row indices is not used for matTranspose */ 1395 matstructT->cprowIndices = NULL; 1396 /* assign the pointer */ 1397 ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 1398 A->transupdated = PETSC_TRUE; 1399 PetscFunctionReturn(0); 1400 } 1401 1402 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 1403 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1404 { 1405 PetscInt n = xx->map->n; 1406 const PetscScalar *barray; 1407 PetscScalar *xarray; 1408 thrust::device_ptr<const PetscScalar> bGPU; 1409 thrust::device_ptr<PetscScalar> xGPU; 1410 cusparseStatus_t stat; 1411 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1412 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1413 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1414 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1415 PetscErrorCode ierr; 1416 1417 PetscFunctionBegin; 1418 /* Analyze the matrix and create the transpose ... 
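/* Note: the csrsv analysis information is generated for a specific operation and matrix
   structure, so solving with op(A) = CUSPARSE_OPERATION_TRANSPOSE would still require its
   own analysis pass and is generally reported to be slower; instead, the transposed
   factors are built and analyzed once and cached in loTriFactorPtrTranspose and
   upTriFactorPtrTranspose for reuse. */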
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get()
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get()
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
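/* In the general solves below, the right-hand side is first reordered by the
   factorization's row permutation (rpermIndices) before the triangular solves, and the
   solution is reordered by the column permutation (cpermIndices) afterwards.  The
   _NaturalOrdering variants skip both permutation copies and can be used when the
   orderings are the identity. */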
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Then, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get()
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get()
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Next, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cudaError_t        cerr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat;

    ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  *array = a->a;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
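/* Copy the host (Mat_SeqAIJ) representation to the GPU.  If only the numerical values
   have changed (same nonzero state, CSR format), the device CSR values are refreshed in
   place; otherwise the cusparse structures are rebuilt from scratch and the cached
   transpose is invalidated. */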
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      matrix->values->assign(a->a, a->a+a->nz);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
    } else {
      PetscInt nnz;
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector     = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
        if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
        else nnz = a->nz;

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build the matrix in the requested storage format: CSR by default, or hybrid/ELLPACK if chosen */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat = new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                     mat->num_rows, mat->num_cols, mat->num_entries,
                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                     mat->values->data().get(),
                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          CsrMatrix *mat = new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
                                  matstruct->descr, mat->values->data().get(),
                                  mat->row_offsets->data().get(),
                                  mat->column_indices->data().get(),
                                  hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
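/* Thrust functors operating on tuples produced by zip iterators: they assign or
   accumulate one tuple component into the other.  Elsewhere in this file they are used
   with thrust::for_each to gather/scatter between compressed work vectors and
   full-length vectors in the MatMult/MatMultAdd kernels. */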
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};

struct MatMatCusparse {
  PetscBool             cisdense;
  PetscScalar           *Bt;
  Mat                   X;
  PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;
  CsrMatrix             *Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr;
  PetscBool             initialized; /* the descriptors below are for C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;
  cusparseDnMatDescr_t  matCDescr;
  PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes */
  size_t                mmBufferSize;
  void                  *mmBuffer;
  void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};

static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  PetscErrorCode   ierr;
  MatMatCusparse   *mmdata = (MatMatCusparse *)data;
  cudaError_t      cerr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
#endif

  PetscFunctionBegin;
  cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
  delete mmdata->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->mmBuffer)    { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); }
  if (mmdata->mmBuffer2)   { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); }
  if (mmdata->matBDescr)   { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->matCDescr)   { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->spgemmDesc)  { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); }
#endif
  ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
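/* Numeric phase of sparse-times-dense products (AB, AtB, ABt, PtAP, RARt).  The sparse
   factor is applied with cusparseSpMM (csrmm before CUDA 11); for PtAP and RARt the
   intermediate A*P (resp. A*R^T) is stored in mmdata->X, and the final dense-dense
   product B^T*X (resp. B*X) is computed by
   MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private. */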
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or the LDAs have changed */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      cudaError_t cerr;
      cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
      cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
#else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
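/* Symbolic phase for AIJ-times-dense products: set the sizes and type of C (a CUDA dense
   matrix, converted back to MATSEQDENSE at the end of the numeric phase when the user
   passed a CPU dense matrix), allocate the B^T work buffer needed by the pre-CUDA-11
   csrmm path, and create the intermediate matrix X for PtAP/RARt. */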
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense;
#if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate a buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have already been computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);

  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
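  /* The SpGEMM descriptor and the work buffers attached to it were created in
     MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE (below in this file); here
     compute only refreshes the numerical values and copy moves them into C's CSR
     arrays. */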
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
  ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}
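/* Symbolic phase for AIJ-times-AIJ products.  With CUDA >= 11 the cusparseSpGEMM API is
   used (work estimation, compute, then copy); it has no values-free symbolic mode, so
   the numeric values are computed here too and the first numeric call simply reuses
   them (see mmdata->reusesym in the numeric routine above).  Before CUDA 11 the
   nonzero pattern is obtained with cusparseXcsrgemmNnz. */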
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed,ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
  size_t                       bufSize2;
  cusparseSpMatDescr_t         BmatSpDescr;
#else
  int                          cnz;
#endif

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  switch (ptype) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    k = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr);
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  c = (Mat_SeqAIJ*)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  Cmat = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if A is in compressed row format, then C will be too */
    c->compressedrow.nrows = a->compressedrow.nrows;
    ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
    ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat      = Cmat;
  Ccusp->mat->mat = Ccsr;
  Ccsr->num_rows    = Ccusp->nrows;
  Ccsr->num_cols    = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
  stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raises errors in different calls when matrices have zero rows/columns! */
    thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
    c->nz = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix*)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
    Bcsr = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
      ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr = Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
                               Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                               Bcsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
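  /* For C = A*B, each nonzero a_ij contributes one multiply-add against row j of B,
     i.e. flops = sum over nonzeros a_ij of 2*nnz(B(j,:)); for C = A^T*B, row i of A
     pairs with row i of B, i.e. flops = sum_i 2*nnz(A(i,:))*nnz(B(i,:)). */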
a->j[j]; 2487 flops += 2.*(b->i[brow+1] - b->i[brow]); 2488 } 2489 } 2490 } else if (ptype == MATPRODUCT_AtB) { 2491 for (i=0, flops = 0; i<A->rmap->n; i++) { 2492 const PetscInt anzi = a->i[i+1] - a->i[i]; 2493 const PetscInt bnzi = b->i[i+1] - b->i[i]; 2494 flops += (2.*anzi)*bnzi; 2495 } 2496 } else { /* TODO */ 2497 flops = 0.; 2498 } 2499 2500 mmdata->flops = flops; 2501 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2502 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2503 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2504 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, 2505 NULL, NULL, NULL, 2506 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2507 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2508 stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2509 /* query the number of bytes of external memory needed by the work-estimation step */ 2510 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2511 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2512 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2513 mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat); 2514 cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr); 2515 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2516 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2517 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2518 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2519 mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat); 2520 /* query the number of bytes of external memory needed by the compute step */ 2521 stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2522 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2523 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2524 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat); 2525 /* Neither the CUSPARSE documentation nor the API is clear here: 2526 we need both buffers to perform the operations properly! 2527 mmdata->mmBuffer2 does not appear anywhere in the compute/copy API; 2528 it only appears in workEstimation, yet it seems to be needed in compute, so presumably its address 2529 is stored in the descriptor! What a messy API...
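
      For reference, a sketch of the CUDA 11 SpGEMM call sequence as used in this function
      (each call below appears in full in the surrounding code):

        cusparseSpGEMM_workEstimation(...,&bufSize2,NULL);       query the size of buffer 2
        cudaMalloc(&mmBuffer2,bufSize2);
        cusparseSpGEMM_workEstimation(...,&bufSize2,mmBuffer2);  inspect A and B
        cusparseSpGEMM_compute(...,&mmBufferSize,NULL);          query the size of buffer 1
        cudaMalloc(&mmBuffer,mmBufferSize);
        cusparseSpGEMM_compute(...,&mmBufferSize,mmBuffer);      numeric A*B
        cusparseSpMatGetSize(...);                               get nnz(C)
        cusparseCsrSetPointers(...);                             attach C's row/col/val arrays
        cusparseSpGEMM_copy(...);                                copy the result into C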
*/ 2530 cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr); 2531 /* compute the intermediate product of A * B */ 2532 stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2533 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2534 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2535 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2536 /* get the number of nonzero entries of C, C_nnz1 */ 2537 stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat); 2538 c->nz = (PetscInt) C_nnz1; 2539 ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr); 2540 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2541 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2542 Ccsr->values = new THRUSTARRAY(c->nz); 2543 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2544 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2545 Ccsr->values->data().get());CHKERRCUSPARSE(stat); 2546 stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2547 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2548 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2549 #else 2550 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 2551 stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2552 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2553 Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2554 Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2555 Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat); 2556 c->nz = cnz; 2557 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2558 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2559 Ccsr->values = new THRUSTARRAY(c->nz); 2560 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2561 2562 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2563 /* With the old gemm interface (removed as of CUDA 11.0) we cannot compute the symbolic factorization only. 2564 I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows one to do the symbolic phase by passing NULL for the values, but it seems quite buggy when 2565 D is NULL, despite the fact that the CUSPARSE documentation claims it is supported!
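
      For reference, the legacy sequence used in this branch is: set the pointer mode to host, call
      cusparseXcsrgemmNnz() to compute nnz(C) and fill C's row offsets, allocate C's column indices
      and values, set the pointer mode back to device, and call cusparse_csr_spgemm() (PETSc's
      precision-dependent wrapper around the cusparseXcsrgemm() routines) to compute the pattern and
      the values in one shot.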
*/ 2566 stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2567 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2568 Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2569 Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2570 Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2571 #endif 2572 ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2573 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2574 finalizesym: 2575 c->singlemalloc = PETSC_FALSE; 2576 c->free_a = PETSC_TRUE; 2577 c->free_ij = PETSC_TRUE; 2578 ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 2579 ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 2580 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 2581 PetscInt *d_i = c->i; 2582 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 2583 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 2584 ii = *Ccsr->row_offsets; 2585 jj = *Ccsr->column_indices; 2586 if (ciscompressed) d_i = c->compressedrow.i; 2587 cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2588 cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2589 } else { 2590 PetscInt *d_i = c->i; 2591 if (ciscompressed) d_i = c->compressedrow.i; 2592 cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2593 cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2594 } 2595 if (ciscompressed) { /* need to expand host row offsets */ 2596 PetscInt r = 0; 2597 c->i[0] = 0; 2598 for (k = 0; k < c->compressedrow.nrows; k++) { 2599 const PetscInt next = c->compressedrow.rindex[k]; 2600 const PetscInt old = c->compressedrow.i[k]; 2601 for (; r < next; r++) c->i[r+1] = old; 2602 } 2603 for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows]; 2604 } 2605 ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 2606 ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 2607 ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 2608 c->maxnz = c->nz; 2609 c->nonzerorowcnt = 0; 2610 c->rmax = 0; 2611 for (k = 0; k < m; k++) { 2612 const PetscInt nn = c->i[k+1] - c->i[k]; 2613 c->ilen[k] = c->imax[k] = nn; 2614 c->nonzerorowcnt += (PetscInt)!!nn; 2615 c->rmax = PetscMax(c->rmax,nn); 2616 } 2617 ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr); 2618 ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 2619 Ccsr->num_entries = c->nz; 2620 2621 C->nonzerostate++; 2622 ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr); 2623 ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr); 2624 Ccusp->nonzerostate = C->nonzerostate; 2625 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 2626 C->preallocated = PETSC_TRUE; 2627 C->assembled = PETSC_FALSE; 2628 C->was_assembled = PETSC_FALSE; 2629 if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */ 2630 mmdata->reusesym = PETSC_TRUE; 2631 C->offloadmask = 
PETSC_OFFLOAD_GPU; 2632 } 2633 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2634 PetscFunctionReturn(0); 2635 } 2636 2637 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 2638 2639 /* handles sparse or dense B */ 2640 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat) 2641 { 2642 Mat_Product *product = mat->product; 2643 PetscErrorCode ierr; 2644 PetscBool isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE; 2645 2646 PetscFunctionBegin; 2647 MatCheckProduct(mat,1); 2648 ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr); 2649 if (!product->A->boundtocpu && !product->B->boundtocpu) { 2650 ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr); 2651 } 2652 if (product->type == MATPRODUCT_ABC) { 2653 Ciscusp = PETSC_FALSE; 2654 if (!product->C->boundtocpu) { 2655 ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr); 2656 } 2657 } 2658 if (isdense) { 2659 switch (product->type) { 2660 case MATPRODUCT_AB: 2661 case MATPRODUCT_AtB: 2662 case MATPRODUCT_ABt: 2663 case MATPRODUCT_PtAP: 2664 case MATPRODUCT_RARt: 2665 if (product->A->boundtocpu) { 2666 ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr); 2667 } else { 2668 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 2669 } 2670 break; 2671 case MATPRODUCT_ABC: 2672 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2673 break; 2674 default: 2675 break; 2676 } 2677 } else if (Biscusp && Ciscusp) { 2678 switch (product->type) { 2679 case MATPRODUCT_AB: 2680 case MATPRODUCT_AtB: 2681 case MATPRODUCT_ABt: 2682 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2683 break; 2684 case MATPRODUCT_PtAP: 2685 case MATPRODUCT_RARt: 2686 case MATPRODUCT_ABC: 2687 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2688 break; 2689 default: 2690 break; 2691 } 2692 } else { /* fallback for AIJ */ 2693 ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 2694 } 2695 PetscFunctionReturn(0); 2696 } 2697 2698 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2699 { 2700 PetscErrorCode ierr; 2701 2702 PetscFunctionBegin; 2703 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2704 PetscFunctionReturn(0); 2705 } 2706 2707 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz) 2708 { 2709 PetscErrorCode ierr; 2710 2711 PetscFunctionBegin; 2712 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2713 PetscFunctionReturn(0); 2714 } 2715 2716 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2717 { 2718 PetscErrorCode ierr; 2719 2720 PetscFunctionBegin; 2721 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 2722 PetscFunctionReturn(0); 2723 } 2724 2725 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2726 { 2727 PetscErrorCode ierr; 2728 2729 PetscFunctionBegin; 2730 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 2731 PetscFunctionReturn(0); 2732 } 2733 2734 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2735 { 2736 PetscErrorCode ierr; 2737 2738 PetscFunctionBegin; 2739 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2740 
PetscFunctionReturn(0); 2741 } 2742 2743 __global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y) 2744 { 2745 int i = blockIdx.x*blockDim.x + threadIdx.x; 2746 if (i < n) y[idx[i]] += x[i]; 2747 } 2748 2749 /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 2750 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm) 2751 { 2752 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2753 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 2754 Mat_SeqAIJCUSPARSEMultStruct *matstruct; 2755 PetscScalar *xarray,*zarray,*dptr,*beta,*xptr; 2756 PetscErrorCode ierr; 2757 cusparseStatus_t stat; 2758 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2759 PetscBool compressed; 2760 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2761 PetscInt nx,ny; 2762 #endif 2763 2764 PetscFunctionBegin; 2765 if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian without transpose not supported"); 2766 if (!a->nonzerorowcnt) { 2767 if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);} 2768 else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);} 2769 PetscFunctionReturn(0); 2770 } 2771 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 2772 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2773 if (!trans) { 2774 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 2775 if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 2776 } else { 2777 if (herm || !A->form_explicit_transpose) { 2778 opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 2779 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 2780 } else { 2781 if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);} 2782 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 2783 } 2784 } 2785 /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 2786 compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE; 2787 2788 try { 2789 ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 2790 if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so we need an up-to-date zarray on the GPU */ 2791 else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write-only zz, so no need to init zarray on the GPU */ 2792 2793 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2794 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 2795 /* z = A x + beta y. 2796 If A is compressed (with fewer rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 2797 When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 2798 */ 2799 xptr = xarray; 2800 dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 2801 beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 2802 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2803 /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 2804 allocated to accommodate different uses. So we get the length info directly from mat.
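         For example, if mat is m-by-n in CSR (num_rows = m, num_cols = n), then for y = A*x we take
         nx = n and ny = m; the transpose/Hermitian branch below swaps these (nx = m, ny = n).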
2805 */ 2806 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2807 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2808 nx = mat->num_cols; 2809 ny = mat->num_rows; 2810 } 2811 #endif 2812 } else { 2813 /* z = A^T x + beta y 2814 If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 2815 Note A^Tx is of full length, so we set beta to 1.0 if y exists. 2816 */ 2817 xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 2818 dptr = zarray; 2819 beta = yy ? matstruct->beta_one : matstruct->beta_zero; 2820 if (compressed) { /* Scatter x to work vector */ 2821 thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 2822 thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 2823 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 2824 VecCUDAEqualsReverse()); 2825 } 2826 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2827 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2828 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2829 nx = mat->num_rows; 2830 ny = mat->num_cols; 2831 } 2832 #endif 2833 } 2834 2835 /* csr_spmv does y = alpha op(A) x + beta y */ 2836 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2837 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2838 if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 2839 if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 2840 cudaError_t cerr; 2841 stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 2842 stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 2843 stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, 2844 matstruct->matDescr, 2845 matstruct->cuSpMV[opA].vecXDescr, beta, 2846 matstruct->cuSpMV[opA].vecYDescr, 2847 cusparse_scalartype, 2848 cusparsestruct->spmvAlg, 2849 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat); 2850 cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr); 2851 2852 matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 2853 } else { 2854 /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 2855 stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat); 2856 stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat); 2857 } 2858 2859 stat = cusparseSpMV(cusparsestruct->handle, opA, 2860 matstruct->alpha_one, 2861 matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTransposeForMult() */ 2862 matstruct->cuSpMV[opA].vecXDescr, 2863 beta, 2864 matstruct->cuSpMV[opA].vecYDescr, 2865 cusparse_scalartype, 2866 cusparsestruct->spmvAlg, 2867 matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat); 2868 #else 2869 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2870 stat = cusparse_csr_spmv(cusparsestruct->handle, opA, 2871 mat->num_rows, mat->num_cols, 2872 mat->num_entries, matstruct->alpha_one, matstruct->descr, 2873 mat->values->data().get(), mat->row_offsets->data().get(), 2874 
mat->column_indices->data().get(), xptr, beta, 2875 dptr);CHKERRCUSPARSE(stat); 2876 #endif 2877 } else { 2878 if (cusparsestruct->nrows) { 2879 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2880 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 2881 #else 2882 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 2883 stat = cusparse_hyb_spmv(cusparsestruct->handle, opA, 2884 matstruct->alpha_one, matstruct->descr, hybMat, 2885 xptr, beta, 2886 dptr);CHKERRCUSPARSE(stat); 2887 #endif 2888 } 2889 } 2890 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2891 2892 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 2893 if (yy) { /* MatMultAdd: zz = A*xx + yy */ 2894 if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 2895 ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */ 2896 } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */ 2897 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 2898 } 2899 } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 2900 ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr); 2901 } 2902 2903 /* ScatterAdd the result from the work vector into the full vector when A is compressed */ 2904 if (compressed) { 2905 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2906 /* I wanted to make this for_each asynchronous, but failed: thrust::async::for_each() returns an event (internally registered), 2907 and in the destructor at the end of the scope it calls cudaStreamSynchronize() on this stream. One would have to store all the events to 2908 prevent that, so I just added a ScatterAdd kernel instead. 2909 */ 2910 #if 0 2911 thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 2912 thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream), 2913 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 2914 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 2915 VecCUDAPlusEquals()); 2916 #else 2917 PetscInt n = matstruct->cprowIndices->size(); 2918 ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray); 2919 #endif 2920 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2921 } 2922 } else { 2923 if (yy && yy != zz) { 2924 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 2925 } 2926 } 2927 ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 2928 if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);} 2929 else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);} 2930 } catch(char *ex) { 2931 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 2932 } 2933 if (yy) { 2934 ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); 2935 } else { 2936 ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr); 2937 } 2938 PetscFunctionReturn(0); 2939 } 2940 2941 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2942 { 2943 PetscErrorCode ierr; 2944 2945 PetscFunctionBegin; 2946 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2947 PetscFunctionReturn(0); 2948 } 2949 2950 static
PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 2951 { 2952 PetscErrorCode ierr; 2953 PetscSplitCSRDataStructure *d_mat = NULL; 2954 PetscFunctionBegin; 2955 if (A->factortype == MAT_FACTOR_NONE) { 2956 d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat; 2957 } 2958 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it? 2959 if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0); 2960 if (d_mat) { 2961 A->offloadmask = PETSC_OFFLOAD_GPU; 2962 } 2963 2964 PetscFunctionReturn(0); 2965 } 2966 2967 /* --------------------------------------------------------------------------------*/ 2968 /*@ 2969 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 2970 (the default parallel PETSc format). This matrix will ultimately be pushed down 2971 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 2972 assembly performance the user should preallocate the matrix storage by setting 2973 the parameter nz (or the array nnz). By setting these parameters accurately, 2974 performance during matrix assembly can be increased by more than a factor of 50. 2975 2976 Collective 2977 2978 Input Parameters: 2979 + comm - MPI communicator, set to PETSC_COMM_SELF 2980 . m - number of rows 2981 . n - number of columns 2982 . nz - number of nonzeros per row (same for all rows) 2983 - nnz - array containing the number of nonzeros in the various rows 2984 (possibly different for each row) or NULL 2985 2986 Output Parameter: 2987 . A - the matrix 2988 2989 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 2990 MatXXXXSetPreallocation() paradigm instead of this routine directly. 2991 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 2992 2993 Notes: 2994 If nnz is given then nz is ignored 2995 2996 The AIJ format (also called the Yale sparse matrix format or 2997 compressed row storage), is fully compatible with standard Fortran 77 2998 storage. That is, the stored row and column indices can begin at 2999 either one (as in Fortran) or zero. See the users' manual for details. 3000 3001 Specify the preallocated storage with either nz or nnz (not both). 3002 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 3003 allocation. For large problems you MUST preallocate memory or you 3004 will get TERRIBLE performance, see the users' manual chapter on matrices. 3005 3006 By default, this format uses inodes (identical nodes) when possible, to 3007 improve numerical efficiency of matrix-vector products and solves. We 3008 search for consecutive rows with the same nonzero structure, thereby 3009 reusing matrix information to achieve increased efficiency.
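
   Example usage, a minimal sketch (the sizes, indices, and values here are illustrative only;
   values set on the host are copied to the GPU the first time the matrix is used there):
.vb
      Mat         A;
      PetscInt    row = 0, col = 0;
      PetscScalar val = 1.0;

      ierr = MatCreateSeqAIJCUSPARSE(PETSC_COMM_SELF,10,10,1,NULL,&A);CHKERRQ(ierr);
      ierr = MatSetValues(A,1,&row,1,&col,&val,INSERT_VALUES);CHKERRQ(ierr);
      ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
      ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
      ierr = MatDestroy(&A);CHKERRQ(ierr);
.ve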
3010 3011 Level: intermediate 3012 3013 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 3014 @*/ 3015 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 3016 { 3017 PetscErrorCode ierr; 3018 3019 PetscFunctionBegin; 3020 ierr = MatCreate(comm,A);CHKERRQ(ierr); 3021 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 3022 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3023 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 3024 PetscFunctionReturn(0); 3025 } 3026 3027 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 3028 { 3029 PetscErrorCode ierr; 3030 PetscSplitCSRDataStructure *d_mat = NULL; 3031 3032 PetscFunctionBegin; 3033 if (A->factortype == MAT_FACTOR_NONE) { 3034 d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat; 3035 ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL; 3036 ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr); 3037 } else { 3038 ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr); 3039 } 3040 if (d_mat) { 3041 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3042 cudaError_t err; 3043 PetscSplitCSRDataStructure h_mat; 3044 ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr); 3045 err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err); 3046 if (a->compressedrow.use) { 3047 err = cudaFree(h_mat.diag.i);CHKERRCUDA(err); 3048 } 3049 err = cudaFree(d_mat);CHKERRCUDA(err); 3050 } 3051 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr); 3052 ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr); 3053 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 3054 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 3055 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr); 3056 ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 3057 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 3058 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 3059 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 3060 PetscFunctionReturn(0); 3061 } 3062 3063 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*); 3064 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool); 3065 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B) 3066 { 3067 PetscErrorCode ierr; 3068 3069 PetscFunctionBegin; 3070 ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 3071 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr); 3072 PetscFunctionReturn(0); 3073 } 3074 3075 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str) 3076 { 3077 PetscErrorCode ierr; 3078 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 3079 Mat_SeqAIJCUSPARSE *cy; 3080 Mat_SeqAIJCUSPARSE *cx; 3081 PetscScalar *ay; 3082 const PetscScalar *ax; 3083 CsrMatrix *csry,*csrx; 3084 3085 PetscFunctionBegin; 3086 cy = 
(Mat_SeqAIJCUSPARSE*)Y->spptr; 3087 cx = (Mat_SeqAIJCUSPARSE*)X->spptr; 3088 if (X->ops->axpy != Y->ops->axpy) { 3089 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr); 3090 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 3091 PetscFunctionReturn(0); 3092 } 3093 /* if we are here, it means both matrices are bound to GPU */ 3094 ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr); 3095 ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr); 3096 if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported"); 3097 if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported"); 3098 csry = (CsrMatrix*)cy->mat->mat; 3099 csrx = (CsrMatrix*)cx->mat->mat; 3100 /* see if we can turn this into a cublas axpy */ 3101 if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) { 3102 bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin()); 3103 if (eq) { 3104 eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin()); 3105 } 3106 if (eq) str = SAME_NONZERO_PATTERN; 3107 } 3108 /* spgeam is buggy with one column */ 3109 if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN; 3110 3111 if (str == SUBSET_NONZERO_PATTERN) { 3112 cusparseStatus_t stat; 3113 PetscScalar b = 1.0; 3114 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3115 size_t bufferSize; 3116 void *buffer; 3117 cudaError_t cerr; 3118 #endif 3119 3120 ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr); 3121 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3122 stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 3123 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3124 stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n, 3125 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3126 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3127 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat); 3128 cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr); 3129 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3130 stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3131 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3132 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3133 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat); 3134 ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr); 3135 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3136 cerr = cudaFree(buffer);CHKERRCUDA(cerr); 3137 #else 3138 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3139 stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3140 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3141 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3142 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat); 3143 ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr); 3144 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3145 #endif 3146 stat = 
cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 3147 ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr); 3148 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3149 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3150 } else if (str == SAME_NONZERO_PATTERN) { 3151 cublasHandle_t cublasv2handle; 3152 cublasStatus_t berr; 3153 PetscBLASInt one = 1, bnz = 1; 3154 3155 ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr); 3156 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3157 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 3158 ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr); 3159 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3160 berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr); 3161 ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr); 3162 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3163 ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr); 3164 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3165 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3166 } else { 3167 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr); 3168 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 3169 } 3170 PetscFunctionReturn(0); 3171 } 3172 3173 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a) 3174 { 3175 PetscErrorCode ierr; 3176 Mat_SeqAIJ *y = (Mat_SeqAIJ*)Y->data; 3177 PetscScalar *ay; 3178 cublasHandle_t cublasv2handle; 3179 cublasStatus_t berr; 3180 PetscBLASInt one = 1, bnz = 1; 3181 3182 PetscFunctionBegin; 3183 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3184 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 3185 ierr = PetscBLASIntCast(y->nz,&bnz);CHKERRQ(ierr); 3186 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3187 berr = cublasXscal(cublasv2handle,bnz,&a,ay,one);CHKERRCUBLAS(berr); 3188 ierr = PetscLogGpuFlops(bnz);CHKERRQ(ierr); 3189 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3190 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3191 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3192 PetscFunctionReturn(0); 3193 } 3194 3195 static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) 3196 { 3197 PetscErrorCode ierr; 3198 PetscBool both = PETSC_FALSE; 3199 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3200 3201 PetscFunctionBegin; 3202 if (A->factortype == MAT_FACTOR_NONE) { 3203 Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr; 3204 if (spptr->mat) { 3205 CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat; 3206 if (matrix->values) { 3207 both = PETSC_TRUE; 3208 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3209 } 3210 } 3211 if (spptr->matTranspose) { 3212 CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat; 3213 if (matrix->values) { 3214 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3215 } 3216 } 3217 } 3218 //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 3219 ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr); 3220 ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 3221 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 3222 else A->offloadmask = PETSC_OFFLOAD_CPU; 3223 3224 PetscFunctionReturn(0); 3225 } 3226 3227 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg) 3228 { 3229 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3230 PetscErrorCode ierr; 3231 3232 PetscFunctionBegin; 3233 if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0); 3234 if (flg) { 3235 ierr = 
MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 3236 3237 A->ops->scale = MatScale_SeqAIJ; 3238 A->ops->axpy = MatAXPY_SeqAIJ; 3239 A->ops->zeroentries = MatZeroEntries_SeqAIJ; 3240 A->ops->mult = MatMult_SeqAIJ; 3241 A->ops->multadd = MatMultAdd_SeqAIJ; 3242 A->ops->multtranspose = MatMultTranspose_SeqAIJ; 3243 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 3244 A->ops->multhermitiantranspose = NULL; 3245 A->ops->multhermitiantransposeadd = NULL; 3246 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ; 3247 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr); 3248 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 3249 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 3250 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 3251 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 3252 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr); 3253 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr); 3254 } else { 3255 A->ops->scale = MatScale_SeqAIJCUSPARSE; 3256 A->ops->axpy = MatAXPY_SeqAIJCUSPARSE; 3257 A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE; 3258 A->ops->mult = MatMult_SeqAIJCUSPARSE; 3259 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 3260 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 3261 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 3262 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 3263 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 3264 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE; 3265 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr); 3266 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3267 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3268 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3269 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3270 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr); 3271 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3272 } 3273 A->boundtocpu = flg; 3274 a->inode.use = flg; 3275 PetscFunctionReturn(0); 3276 } 3277 3278 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 3279 { 3280 PetscErrorCode ierr; 3281 cusparseStatus_t stat; 3282 Mat B; 3283 3284 PetscFunctionBegin; 3285 ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */ 3286 if (reuse == MAT_INITIAL_MATRIX) { 3287 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 3288 } else if (reuse == MAT_REUSE_MATRIX) { 
3289 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 3290 } 3291 B = *newmat; 3292 3293 ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 3294 ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr); 3295 3296 if (reuse != MAT_REUSE_MATRIX && !B->spptr) { 3297 if (B->factortype == MAT_FACTOR_NONE) { 3298 Mat_SeqAIJCUSPARSE *spptr; 3299 ierr = PetscNew(&spptr);CHKERRQ(ierr); 3300 stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3301 stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat); 3302 spptr->format = MAT_CUSPARSE_CSR; 3303 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3304 spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 3305 spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 3306 spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 3307 #endif 3308 B->spptr = spptr; 3309 } else { 3310 Mat_SeqAIJCUSPARSETriFactors *spptr; 3311 3312 ierr = PetscNew(&spptr);CHKERRQ(ierr); 3313 stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3314 stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat); 3315 B->spptr = spptr; 3316 } 3317 B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 3318 } 3319 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 3320 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 3321 B->ops->setoption = MatSetOption_SeqAIJCUSPARSE; 3322 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 3323 B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 3324 B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 3325 3326 ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr); 3327 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3328 ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 3329 PetscFunctionReturn(0); 3330 } 3331 3332 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 3333 { 3334 PetscErrorCode ierr; 3335 3336 PetscFunctionBegin; 3337 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 3338 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 3339 PetscFunctionReturn(0); 3340 } 3341 3342 /*MC 3343 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 3344 3345 A matrix type whose data resides on Nvidia GPUs. These matrices can be in either 3346 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 3347 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 3348 3349 Options Database Keys: 3350 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 3351 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3352 - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
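
   For example, a code that configures its matrices with MatSetFromOptions() (here a hypothetical
   executable ex1) can select this type and its storage format at run time with
.vb
      ./ex1 -mat_type aijcusparse -mat_cusparse_storage_format csr
.ve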
3353 3354 Level: beginner 3355 3356 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 3357 M*/ 3358 3359 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*); 3360 3361 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 3362 { 3363 PetscErrorCode ierr; 3364 3365 PetscFunctionBegin; 3366 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr); 3367 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3368 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3369 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3370 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3371 3372 PetscFunctionReturn(0); 3373 } 3374 3375 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 3376 { 3377 PetscErrorCode ierr; 3378 cusparseStatus_t stat; 3379 3380 PetscFunctionBegin; 3381 if (*cusparsestruct) { 3382 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr); 3383 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr); 3384 delete (*cusparsestruct)->workVector; 3385 delete (*cusparsestruct)->rowoffsets_gpu; 3386 delete (*cusparsestruct)->cooPerm; 3387 delete (*cusparsestruct)->cooPerm_a; 3388 delete (*cusparsestruct)->csr2csc_i; 3389 if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);} 3390 ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr); 3391 } 3392 PetscFunctionReturn(0); 3393 } 3394 3395 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 3396 { 3397 PetscFunctionBegin; 3398 if (*mat) { 3399 delete (*mat)->values; 3400 delete (*mat)->column_indices; 3401 delete (*mat)->row_offsets; 3402 delete *mat; 3403 *mat = 0; 3404 } 3405 PetscFunctionReturn(0); 3406 } 3407 3408 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 3409 { 3410 cusparseStatus_t stat; 3411 PetscErrorCode ierr; 3412 3413 PetscFunctionBegin; 3414 if (*trifactor) { 3415 if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); } 3416 if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); } 3417 ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 3418 if ((*trifactor)->solveBuffer) {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);} 3419 if ((*trifactor)->AA_h) {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);} 3420 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3421 if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);} 3422 #endif 3423 ierr = PetscFree(*trifactor);CHKERRQ(ierr); 3424 } 3425 PetscFunctionReturn(0); 3426 } 3427 3428 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 3429 { 3430 CsrMatrix *mat; 3431 cusparseStatus_t stat; 3432 
cudaError_t err; 3433 3434 PetscFunctionBegin; 3435 if (*matstruct) { 3436 if ((*matstruct)->mat) { 3437 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 3438 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3439 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3440 #else 3441 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 3442 stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat); 3443 #endif 3444 } else { 3445 mat = (CsrMatrix*)(*matstruct)->mat; 3446 CsrMatrix_Destroy(&mat); 3447 } 3448 } 3449 if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); } 3450 delete (*matstruct)->cprowIndices; 3451 if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); } 3452 if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); } 3453 if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); } 3454 3455 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3456 Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 3457 if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);} 3458 for (int i=0; i<3; i++) { 3459 if (mdata->cuSpMV[i].initialized) { 3460 err = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err); 3461 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat); 3462 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat); 3463 } 3464 } 3465 #endif 3466 delete *matstruct; 3467 *matstruct = NULL; 3468 } 3469 PetscFunctionReturn(0); 3470 } 3471 3472 PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors) 3473 { 3474 PetscErrorCode ierr; 3475 3476 PetscFunctionBegin; 3477 if (*trifactors) { 3478 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr); 3479 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr); 3480 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr); 3481 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr); 3482 delete (*trifactors)->rpermIndices; 3483 delete (*trifactors)->cpermIndices; 3484 delete (*trifactors)->workVector; 3485 (*trifactors)->rpermIndices = NULL; 3486 (*trifactors)->cpermIndices = NULL; 3487 (*trifactors)->workVector = NULL; 3488 if ((*trifactors)->a_band_d) {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);} 3489 if ((*trifactors)->i_band_d) {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);} 3490 (*trifactors)->init_dev_prop = PETSC_FALSE; 3491 } 3492 PetscFunctionReturn(0); 3493 } 3494 3495 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 3496 { 3497 PetscErrorCode ierr; 3498 cusparseHandle_t handle; 3499 cusparseStatus_t stat; 3500 3501 PetscFunctionBegin; 3502 if (*trifactors) { 3503 ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr); 3504 if (handle = (*trifactors)->handle) { 3505 stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 3506 } 3507 ierr = PetscFree(*trifactors);CHKERRQ(ierr); 3508 } 3509 PetscFunctionReturn(0); 3510 } 3511 3512 struct IJCompare 3513 { 3514 __host__ __device__ 3515 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3516 { 3517 if (t1.get<0>() < t2.get<0>()) return true; 3518 if (t1.get<0>() == 
t2.get<0>()) return t1.get<1>() < t2.get<1>(); 3519 return false; 3520 } 3521 }; 3522 3523 struct IJEqual 3524 { 3525 __host__ __device__ 3526 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3527 { 3528 if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false; 3529 return true; 3530 } 3531 }; 3532 3533 struct IJDiff 3534 { 3535 __host__ __device__ 3536 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 3537 { 3538 return t1 == t2 ? 0 : 1; 3539 } 3540 }; 3541 3542 struct IJSum 3543 { 3544 __host__ __device__ 3545 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 3546 { 3547 return t1||t2; 3548 } 3549 }; 3550 3551 #include <thrust/iterator/discard_iterator.h> 3552 PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 3553 { 3554 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3555 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3556 THRUSTARRAY *cooPerm_v = NULL; 3557 thrust::device_ptr<const PetscScalar> d_v; 3558 CsrMatrix *matrix; 3559 PetscErrorCode ierr; 3560 PetscInt n; 3561 3562 PetscFunctionBegin; 3563 if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct"); 3564 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix"); 3565 if (!cusp->cooPerm) { 3566 ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3567 ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3568 PetscFunctionReturn(0); 3569 } 3570 matrix = (CsrMatrix*)cusp->mat->mat; 3571 if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3572 if (!v) { 3573 if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3574 goto finalize; 3575 } 3576 n = cusp->cooPerm->size(); 3577 if (isCudaMem(v)) { 3578 d_v = thrust::device_pointer_cast(v); 3579 } else { 3580 cooPerm_v = new THRUSTARRAY(n); 3581 cooPerm_v->assign(v,v+n); 3582 d_v = cooPerm_v->data(); 3583 ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); 3584 } 3585 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3586 if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */ 3587 if (cusp->cooPerm_a) { 3588 THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size()); 3589 auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3590 thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3591 thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>()); 3592 delete cooPerm_w; 3593 } else { 3594 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 3595 matrix->values->begin())); 3596 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 3597 matrix->values->end())); 3598 thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); 3599 } 3600 } else { 3601 if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */ 3602 auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3603 thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3604 } else { 3605 
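/* all COO entries are unique: scatter the user-provided values through cooPerm straight into the CSR value array (plain assignment via VecCUDAEquals below) */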
auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 3606 matrix->values->begin())); 3607 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 3608 matrix->values->end())); 3609 thrust::for_each(zibit,zieit,VecCUDAEquals()); 3610 } 3611 } 3612 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3613 finalize: 3614 delete cooPerm_v; 3615 A->offloadmask = PETSC_OFFLOAD_GPU; 3616 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 3617 /* shorter version of MatAssemblyEnd_SeqAIJ */ 3618 ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr); 3619 ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 3620 ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr); 3621 a->reallocs = 0; 3622 A->info.mallocs += 0; 3623 A->info.nz_unneeded = 0; 3624 A->assembled = A->was_assembled = PETSC_TRUE; 3625 A->num_ass++; 3626 PetscFunctionReturn(0); 3627 } 3628 3629 PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy) 3630 { 3631 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3632 PetscErrorCode ierr; 3633 3634 PetscFunctionBegin; 3635 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3636 if (!cusp) PetscFunctionReturn(0); 3637 if (destroy) { 3638 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr); 3639 delete cusp->csr2csc_i; 3640 cusp->csr2csc_i = NULL; 3641 } 3642 A->transupdated = PETSC_FALSE; 3643 PetscFunctionReturn(0); 3644 } 3645 3646 #include <thrust/binary_search.h> 3647 PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[]) 3648 { 3649 PetscErrorCode ierr; 3650 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3651 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3652 PetscInt cooPerm_n, nzr = 0; 3653 cudaError_t cerr; 3654 3655 PetscFunctionBegin; 3656 ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr); 3657 ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr); 3658 cooPerm_n = cusp->cooPerm ? 
cusp->cooPerm->size() : 0; 3659 if (n != cooPerm_n) { 3660 delete cusp->cooPerm; 3661 delete cusp->cooPerm_a; 3662 cusp->cooPerm = NULL; 3663 cusp->cooPerm_a = NULL; 3664 } 3665 if (n) { 3666 THRUSTINTARRAY d_i(n); 3667 THRUSTINTARRAY d_j(n); 3668 THRUSTINTARRAY ii(A->rmap->n); 3669 3670 if (!cusp->cooPerm) { cusp->cooPerm = new THRUSTINTARRAY(n); } 3671 if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); } 3672 3673 ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 3674 d_i.assign(coo_i,coo_i+n); 3675 d_j.assign(coo_j,coo_j+n); 3676 auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin())); 3677 auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end())); 3678 3679 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3680 thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0); 3681 thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); 3682 *cusp->cooPerm_a = d_i; 3683 THRUSTINTARRAY w = d_j; 3684 3685 auto nekey = thrust::unique(fkey, ekey, IJEqual()); 3686 if (nekey == ekey) { /* all entries are unique */ 3687 delete cusp->cooPerm_a; 3688 cusp->cooPerm_a = NULL; 3689 } else { /* I couldn't come up with a more elegant algorithm */ 3690 adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); 3691 adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff()); 3692 (*cusp->cooPerm_a)[0] = 0; 3693 w[0] = 0; 3694 thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); 3695 thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); 3696 } 3697 thrust::counting_iterator<PetscInt> search_begin(0); 3698 thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), 3699 search_begin, search_begin + A->rmap->n, 3700 ii.begin()); 3701 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3702 3703 ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr); 3704 a->singlemalloc = PETSC_FALSE; 3705 a->free_a = PETSC_TRUE; 3706 a->free_ij = PETSC_TRUE; 3707 ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr); 3708 a->i[0] = 0; 3709 cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 3710 a->nz = a->maxnz = a->i[A->rmap->n]; 3711 a->rmax = 0; 3712 ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr); 3713 ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr); 3714 cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 3715 if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); } 3716 if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); } 3717 for (PetscInt i = 0; i < A->rmap->n; i++) { 3718 const PetscInt nnzr = a->i[i+1] - a->i[i]; 3719 nzr += (PetscInt)!!(nnzr); 3720 a->ilen[i] = a->imax[i] = nnzr; 3721 a->rmax = PetscMax(a->rmax,nnzr); 3722 } 3723 a->nonzerorowcnt = nzr; 3724 A->preallocated = PETSC_TRUE; 3725 ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr); 3726 ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr); 3727 } else { 3728 ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr); 3729 } 3730 ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr); 3731 3732 /* We want to allocate the CUSPARSE struct for matvec now. 
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  PetscFunctionReturn(0);
}

PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  *a = NULL;
  PetscFunctionReturn(0);
}

PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL;
  PetscFunctionReturn(0);
}
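/* Example (sketch, not compiled): the pointer returned by MatSeqAIJCUSPARSEGetArray()
   is device memory, so it must be manipulated from device code, e.g. with thrust.
   The functor, the helper name, and the nz parameter (the number of stored nonzeros,
   a->nz) are made up for illustration. */
#if 0
struct ScaleBy
{
  PetscScalar alpha;
  ScaleBy(PetscScalar a) : alpha(a) {}
  __host__ __device__ PetscScalar operator()(const PetscScalar &x) const { return alpha*x; }
};

static PetscErrorCode ScaleDeviceValues_Example(Mat A, PetscScalar alpha, PetscInt nz)
{
  PetscErrorCode ierr;
  PetscScalar    *va;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEGetArray(A,&va);CHKERRQ(ierr);
  {
    thrust::device_ptr<PetscScalar> dva = thrust::device_pointer_cast(va);
    thrust::transform(dva,dva+nz,dva,ScaleBy(alpha)); /* runs on the GPU */
  }
  ierr = MatSeqAIJCUSPARSERestoreArray(A,&va);CHKERRQ(ierr); /* increases the object state */
  PetscFunctionReturn(0);
}
#endif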
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL;
  PetscFunctionReturn(0);
}

/* orders zipped COO tuples lexicographically by (row, col); the value and
   provenance-flag components do not take part in the comparison */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    if (t1.get<0>() < t2.get<0>()) return true;
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
    return false;
  }
};

/* adds a fixed offset to an index (used below to shift column indices) */
struct Shift
{
  int _shift;

  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &c)
  {
    return c + _shift;
  }
};
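/* Example (sketch, not compiled): Shift is meant to be used with
   thrust::make_transform_iterator so that the offset is applied to the indices
   on the fly, without materializing a shifted copy.  The data here is made up
   for illustration. */
#if 0
static void ShiftExample(void)
{
  thrust::device_vector<int> cols(3,1); /* three column indices, all equal to 1 */
  auto sb = thrust::make_transform_iterator(cols.begin(),Shift(5));
  auto se = thrust::make_transform_iterator(cols.end(),Shift(5));
  int  s  = thrust::reduce(sb,se); /* reads cols[k]+5 on the fly, so s = 3*(1+5) = 18 */
  (void)s;
}
#endif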
/* Merges two SeqAIJCUSPARSE matrices side by side, i.e. the [A';B']' operation in MATLAB notation */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     Annz,Bnnz;
  cusparseStatus_t             stat;
  PetscInt                     i,m,n,zero = 0;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",A->rmap->n,B->rmap->n);
  if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
  if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    m = A->rmap->n;
    n = A->cmap->n + B->cmap->n;
    ierr = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
    ierr = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
    ierr = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    c = (Mat_SeqAIJ*)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
    Cmat = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows    = m;
    Ccusp->mat      = Cmat;
    Ccusp->mat->mat = Ccsr;
    Ccsr->num_rows  = m;
    Ccsr->num_cols  = n;
    stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr);
    if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr = (CsrMatrix*)Acusp->mat->mat;
    Bcsr = (CsrMatrix*)Bcusp->mat->mat;
    Annz = (PetscInt)Acsr->column_indices->size();
    Bnnz = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets    = new THRUSTINTARRAY32(m+1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    Ccsr->num_entries    = c->nz;
    Ccusp->cooPerm       = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      auto Acoo = new THRUSTINTARRAY32(Annz);
      auto Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff,*Broff;

      if (a->compressedrow.use) { /* need full row offsets */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
          ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offsets */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
          ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      stat = cusparseXcsr2coo(Acusp->handle,
                              Aroff->data().get(),
                              Annz,
                              m,
                              Acoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle,
                              Broff->data().get(),
                              Bnnz,
                              m,
                              Bcoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
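      /* Merge strategy: the csr2coo calls above expand the row offsets of A and B
         into explicit COO row indices, so each matrix is now a (row, col, val)
         stream sorted by (row, col).  B's columns are shifted by A->cmap->n, hence
         within each row every entry of A precedes every entry of B, and a single
         thrust::merge with IJCompare4 interleaves the two streams into C.  The
         fourth tuple component (constant 1 for A, 0 for B) records the provenance
         of each merged entry so that cooPerm can be split below into the positions
         of A's values and of B's values inside C's storage. */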
      /* Using a bool for the provenance flag causes issues with large matrices on SUMMIT with CUDA 10.2.89, so plain ints are used */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
#else
      /* on CUDA < 10 there are issues instantiating the merge operation with a transform
         iterator for the columns of B, so shift the indices in place and undo the shift below */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
      auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
      auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
      auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
      auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
      auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
      auto p1 = Ccusp->cooPerm->begin();
      auto p2 = Ccusp->cooPerm->begin();
      thrust::advance(p2,Annz);
      PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
      thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n)); /* undo the in-place shift of B's columns */
#endif
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 /* thrust::partition_copy errors on SUMMIT with CUDA 11.1.0 */
      PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      /* emulate partition_copy: positions flagged 1 (entries of A) go to the first half
         of cooPerm, positions flagged 0 (entries of B) go to the second half */
      auto pred = thrust::identity<int>();
      PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
      PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
#endif
      stat = cusparseXcoo2csr(Ccusp->handle,
                              Ccoo->data().get(),
                              c->nz,
                              m,
                              Ccsr->row_offsets->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
                               Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
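      /* C = [A B], so C^T is A^T stacked on top of B^T.  When both A and B carry an
         explicit transpose, C^T can be assembled directly below: its row offsets are
         A^T's offsets followed by B^T's offsets shifted by a->nz (the number of
         entries A^T contributes), and its column indices and values are plain
         concatenations. */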
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C's transpose too */
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix *CcsrT = new CsrMatrix;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated = PETSC_TRUE;
        Ccusp->rowoffsets_gpu = NULL;
        CmatT->cprowIndices = NULL;
        CmatT->mat = CcsrT;
        CcsrT->num_rows    = n;
        CcsrT->num_cols    = m;
        CcsrT->num_entries = c->nz;

        CcsrT->row_offsets    = new THRUSTINTARRAY32(n+1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values         = new THRUSTARRAY(c->nz);

        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
          thrust::advance(rT,-1); /* A^T's last offset coincides with B^T's first shifted offset */
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
          thrust::copy(titb,tite,rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

        stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
                                 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
                                 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
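    /* The CSR arrays on the GPU are 32-bit.  With 64-bit PETSc indices, assigning a
       THRUSTINTARRAY32 to a THRUSTINTARRAY below converts int -> PetscInt element-wise
       on the device, so the host only receives already-widened data. */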
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii = *Ccsr->row_offsets;
      jj = *Ccsr->column_indices;
      cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    } else {
      cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
    c->maxnz         = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax          = 0;
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i+1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax,nn);
    }
    ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
    (*C)->nonzerostate++;
    ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
    ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",(*C)->rmap->n,B->rmap->n);
    c = (Mat_SeqAIJ*)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
      if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
      if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
      if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
      if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix*)Acusp->mat->mat;
      Bcsr = (CsrMatrix*)Bcusp->mat->mat;
      Ccsr = (CsrMatrix*)Ccusp->mat->mat;
      if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
      if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
      if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
      if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
      if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid,Acsr->num_entries);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      /* scatter A's values into their slots in C through the first half of cooPerm */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      thrust::for_each(zibait,zieait,VecCUDAEquals());
      /* scatter B's values through the second half of cooPerm */
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
      thrust::for_each(zibbit,ziebit,VecCUDAEquals());
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr);
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    }
  }
  ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;
  const PetscScalar *av;
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    THRUSTARRAY *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w  = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) {
      cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    delete w;
  } else {
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); } /* data moved device -> host above */
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
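/* Example (sketch, not compiled): the zip/permutation-iterator loop above is a
   gather, i.e. dst[k] = src[idx[k]]; thrust::gather expresses the same operation
   directly.  The data is made up for illustration. */
#if 0
static void GatherExample(void)
{
  thrust::device_vector<PetscScalar> src(4);
  thrust::device_vector<int>         idx(2);
  thrust::device_vector<PetscScalar> dst(2);
  src[0] = 10; src[1] = 11; src[2] = 12; src[3] = 13;
  idx[0] = 3;  idx[1] = 1;
  thrust::gather(idx.begin(),idx.end(),src.begin(),dst.begin()); /* dst = {13,11} */
}
#endif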