/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <thrust/async/for_each.h>

const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
  } cusparseCsr2CscAlg_t;
*/
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);

PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);

PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          flg;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg || !cusparsestruct) PetscFunctionReturn(0);
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for sequential matrices
  on a single GPU of type seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
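/*
   A minimal usage sketch (illustration only, not part of the implementation below): selecting
   this solver for an ILU preconditioner from application code. The KSP/PC calls are standard
   PETSc usage; the matrix assembly is assumed to have been done elsewhere.

     Mat A;    // assembled as MATSEQAIJCUSPARSE
     KSP ksp;
     PC  pc;
     KSPCreate(PETSC_COMM_SELF,&ksp);
     KSPSetOperators(ksp,A,A);
     KSPGetPC(ksp,&pc);
     PCSetType(pc,PCILU);
     PCFactorSetMatSolverType(pc,MATSOLVERCUSPARSE);
     KSPSetFromOptions(ksp);

   or, equivalently, from the command line:

     -mat_type seqaijcusparse -pc_type ilu -pc_factor_mat_solver_type cusparse
*/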
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr);
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr);
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  (*B)->canuseordering = PETSC_TRUE;
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.

   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require at least CUDA 4.2 and are not supported with CUDA 11 and later)

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_FORM_EXPLICIT_TRANSPOSE:
    /* need to destroy the transpose matrix if present, to prevent logic errors if flg is set to true again later */
    if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);}
    A->form_explicit_transpose = flg;
    break;
  default:
    ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
    break;
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);

static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used.
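     If both the row permutation and the column permutation are the identity, the factors can be
     applied in the natural ordering, so the NaturalOrdering solve kernels (which skip applying
     rpermIndices/cpermIndices) are installed; otherwise the general MatSolve_SeqAIJCUSPARSE
     kernels, which permute the right-hand side and the solution on the GPU, are used.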
*/ 263 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 264 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 265 if (row_identity && col_identity) { 266 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 267 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 268 B->ops->matsolve = NULL; 269 B->ops->matsolvetranspose = NULL; 270 } else { 271 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 272 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 273 B->ops->matsolve = NULL; 274 B->ops->matsolvetranspose = NULL; 275 } 276 277 /* get the triangular factors */ 278 ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 279 PetscFunctionReturn(0); 280 } 281 282 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 283 { 284 PetscErrorCode ierr; 285 MatCUSPARSEStorageFormat format; 286 PetscBool flg; 287 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 288 289 PetscFunctionBegin; 290 ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr); 291 if (A->factortype == MAT_FACTOR_NONE) { 292 ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV", 293 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 294 if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);} 295 296 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 297 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 298 if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);} 299 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 300 ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", 301 "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr); 302 /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */ 303 if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 304 305 ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", 306 "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr); 307 if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly"); 308 309 ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", 310 "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr); 311 if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly"); 312 #endif 313 } 314 ierr = PetscOptionsTail();CHKERRQ(ierr); 315 PetscFunctionReturn(0); 316 } 317 318 static PetscErrorCode 
MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 319 { 320 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 321 PetscErrorCode ierr; 322 323 PetscFunctionBegin; 324 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 325 ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 326 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 327 PetscFunctionReturn(0); 328 } 329 330 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 331 { 332 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 333 PetscErrorCode ierr; 334 335 PetscFunctionBegin; 336 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 337 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 338 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 339 PetscFunctionReturn(0); 340 } 341 342 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 343 { 344 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 345 PetscErrorCode ierr; 346 347 PetscFunctionBegin; 348 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 349 ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 350 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 351 PetscFunctionReturn(0); 352 } 353 354 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 355 { 356 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 357 PetscErrorCode ierr; 358 359 PetscFunctionBegin; 360 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 361 ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 362 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 363 PetscFunctionReturn(0); 364 } 365 366 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 367 { 368 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 369 PetscInt n = A->rmap->n; 370 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 371 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 372 cusparseStatus_t stat; 373 const PetscInt *ai = a->i,*aj = a->j,*vi; 374 const MatScalar *aa = a->a,*v; 375 PetscInt *AiLo, *AjLo; 376 PetscInt i,nz, nzLower, offset, rowOffset; 377 PetscErrorCode ierr; 378 cudaError_t cerr; 379 380 PetscFunctionBegin; 381 if (!n) PetscFunctionReturn(0); 382 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 383 try { 384 /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. 
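         (The factor is stored here with an explicit unit diagonal, which is why n is added to the
         count below.)  As an illustration of the CSR layout that the loop below assembles, consider
         a hypothetical 3x3 factor (not taken from this code) with strictly lower entries
         L(1,0)=a, L(2,0)=b, L(2,1)=c.  The unit-diagonal lower triangular matrix is then stored as

            AiLo = {0, 1, 3, 6}
            AjLo = {0,  0, 1,  0, 1, 2}
            AALo = {1,  a, 1,  b, c, 1}

         i.e. each row keeps its strictly lower entries copied from the host factor, followed by an
         explicit 1.0 appended in the diagonal position.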
*/ 385 nzLower=n+ai[n]-ai[1]; 386 if (!loTriFactor) { 387 PetscScalar *AALo; 388 389 cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr); 390 391 /* Allocate Space for the lower triangular matrix */ 392 cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 393 cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr); 394 395 /* Fill the lower triangular matrix */ 396 AiLo[0] = (PetscInt) 0; 397 AiLo[n] = nzLower; 398 AjLo[0] = (PetscInt) 0; 399 AALo[0] = (MatScalar) 1.0; 400 v = aa; 401 vi = aj; 402 offset = 1; 403 rowOffset= 1; 404 for (i=1; i<n; i++) { 405 nz = ai[i+1] - ai[i]; 406 /* additional 1 for the term on the diagonal */ 407 AiLo[i] = rowOffset; 408 rowOffset += nz+1; 409 410 ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr); 411 ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr); 412 413 offset += nz; 414 AjLo[offset] = (PetscInt) i; 415 AALo[offset] = (MatScalar) 1.0; 416 offset += 1; 417 418 v += nz; 419 vi += nz; 420 } 421 422 /* allocate space for the triangular factor information */ 423 ierr = PetscNew(&loTriFactor);CHKERRQ(ierr); 424 loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 425 /* Create the matrix description */ 426 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 427 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 428 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 429 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 430 #else 431 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 432 #endif 433 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat); 434 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 435 436 /* set the operation */ 437 loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 438 439 /* set the matrix */ 440 loTriFactor->csrMat = new CsrMatrix; 441 loTriFactor->csrMat->num_rows = n; 442 loTriFactor->csrMat->num_cols = n; 443 loTriFactor->csrMat->num_entries = nzLower; 444 445 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 446 loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1); 447 448 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 449 loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower); 450 451 loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 452 loTriFactor->csrMat->values->assign(AALo, AALo+nzLower); 453 454 /* Create the solve analysis information */ 455 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 456 stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 457 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 458 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 459 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 460 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 461 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 462 &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 463 cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 464 #endif 465 466 /* perform the solve analysis */ 467 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 468 loTriFactor->csrMat->num_rows, 
loTriFactor->csrMat->num_entries, loTriFactor->descr, 469 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 470 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo 471 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 472 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 473 #endif 474 );CHKERRCUSPARSE(stat); 475 cerr = WaitForCUDA();CHKERRCUDA(cerr); 476 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 477 478 /* assign the pointer */ 479 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 480 loTriFactor->AA_h = AALo; 481 cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr); 482 cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr); 483 ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 484 } else { /* update values only */ 485 if (!loTriFactor->AA_h) { 486 cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr); 487 } 488 /* Fill the lower triangular matrix */ 489 loTriFactor->AA_h[0] = 1.0; 490 v = aa; 491 vi = aj; 492 offset = 1; 493 for (i=1; i<n; i++) { 494 nz = ai[i+1] - ai[i]; 495 ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr); 496 offset += nz; 497 loTriFactor->AA_h[offset] = 1.0; 498 offset += 1; 499 v += nz; 500 } 501 loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower); 502 ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 503 } 504 } catch(char *ex) { 505 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 506 } 507 } 508 PetscFunctionReturn(0); 509 } 510 511 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 512 { 513 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 514 PetscInt n = A->rmap->n; 515 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 516 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 517 cusparseStatus_t stat; 518 const PetscInt *aj = a->j,*adiag = a->diag,*vi; 519 const MatScalar *aa = a->a,*v; 520 PetscInt *AiUp, *AjUp; 521 PetscInt i,nz, nzUpper, offset; 522 PetscErrorCode ierr; 523 cudaError_t cerr; 524 525 PetscFunctionBegin; 526 if (!n) PetscFunctionReturn(0); 527 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 528 try { 529 /* next, figure out the number of nonzeros in the upper triangular matrix. 
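         In the host factor the U part is addressed through adiag[], with rows stored starting from
         the end of the array, so adiag[0]-adiag[n] gives the total number of upper triangular
         entries including the diagonal.  Note that the host factor keeps the reciprocal of each
         diagonal entry of U (so the CPU triangular solve can multiply rather than divide); the loop
         below therefore stores 1./v[nz], i.e. the true diagonal value, because this matrix
         descriptor is created with CUSPARSE_DIAG_TYPE_NON_UNIT.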
*/ 530 nzUpper = adiag[0]-adiag[n]; 531 if (!upTriFactor) { 532 PetscScalar *AAUp; 533 534 cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 535 536 /* Allocate Space for the upper triangular matrix */ 537 cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 538 cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 539 540 /* Fill the upper triangular matrix */ 541 AiUp[0]=(PetscInt) 0; 542 AiUp[n]=nzUpper; 543 offset = nzUpper; 544 for (i=n-1; i>=0; i--) { 545 v = aa + adiag[i+1] + 1; 546 vi = aj + adiag[i+1] + 1; 547 548 /* number of elements NOT on the diagonal */ 549 nz = adiag[i] - adiag[i+1]-1; 550 551 /* decrement the offset */ 552 offset -= (nz+1); 553 554 /* first, set the diagonal elements */ 555 AjUp[offset] = (PetscInt) i; 556 AAUp[offset] = (MatScalar)1./v[nz]; 557 AiUp[i] = AiUp[i+1] - (nz+1); 558 559 ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr); 560 ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr); 561 } 562 563 /* allocate space for the triangular factor information */ 564 ierr = PetscNew(&upTriFactor);CHKERRQ(ierr); 565 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 566 567 /* Create the matrix description */ 568 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 569 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 570 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 571 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 572 #else 573 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 574 #endif 575 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 576 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 577 578 /* set the operation */ 579 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 580 581 /* set the matrix */ 582 upTriFactor->csrMat = new CsrMatrix; 583 upTriFactor->csrMat->num_rows = n; 584 upTriFactor->csrMat->num_cols = n; 585 upTriFactor->csrMat->num_entries = nzUpper; 586 587 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 588 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 589 590 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 591 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 592 593 upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 594 upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 595 596 /* Create the solve analysis information */ 597 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 598 stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 599 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 600 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 601 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 602 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 603 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 604 &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 605 cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 606 #endif 607 608 /* perform the solve analysis */ 609 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 610 upTriFactor->csrMat->num_rows, 
upTriFactor->csrMat->num_entries, upTriFactor->descr, 611 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 612 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo 613 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 614 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 615 #endif 616 );CHKERRCUSPARSE(stat); 617 cerr = WaitForCUDA();CHKERRCUDA(cerr); 618 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 619 620 /* assign the pointer */ 621 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 622 upTriFactor->AA_h = AAUp; 623 cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 624 cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 625 ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 626 } else { 627 if (!upTriFactor->AA_h) { 628 cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 629 } 630 /* Fill the upper triangular matrix */ 631 offset = nzUpper; 632 for (i=n-1; i>=0; i--) { 633 v = aa + adiag[i+1] + 1; 634 635 /* number of elements NOT on the diagonal */ 636 nz = adiag[i] - adiag[i+1]-1; 637 638 /* decrement the offset */ 639 offset -= (nz+1); 640 641 /* first, set the diagonal elements */ 642 upTriFactor->AA_h[offset] = 1./v[nz]; 643 ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr); 644 } 645 upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper); 646 ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 647 } 648 } catch(char *ex) { 649 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 650 } 651 } 652 PetscFunctionReturn(0); 653 } 654 655 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 656 { 657 PetscErrorCode ierr; 658 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 659 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 660 IS isrow = a->row,iscol = a->icol; 661 PetscBool row_identity,col_identity; 662 PetscInt n = A->rmap->n; 663 664 PetscFunctionBegin; 665 if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 666 ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr); 667 ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr); 668 669 if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 670 cusparseTriFactors->nnz=a->nz; 671 672 A->offloadmask = PETSC_OFFLOAD_BOTH; 673 /* lower triangular indices */ 674 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 675 if (!row_identity && !cusparseTriFactors->rpermIndices) { 676 const PetscInt *r; 677 678 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 679 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 680 cusparseTriFactors->rpermIndices->assign(r, r+n); 681 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 682 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 683 } 684 685 /* upper triangular indices */ 686 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 687 if (!col_identity && !cusparseTriFactors->cpermIndices) { 688 const PetscInt *c; 689 690 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 691 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 692 cusparseTriFactors->cpermIndices->assign(c, c+n); 693 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 694 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 695 } 696 PetscFunctionReturn(0); 697 } 698 699 static PetscErrorCode 
MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 700 { 701 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 702 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 703 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 704 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 705 cusparseStatus_t stat; 706 PetscErrorCode ierr; 707 cudaError_t cerr; 708 PetscInt *AiUp, *AjUp; 709 PetscScalar *AAUp; 710 PetscScalar *AALo; 711 PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 712 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 713 const PetscInt *ai = b->i,*aj = b->j,*vj; 714 const MatScalar *aa = b->a,*v; 715 716 PetscFunctionBegin; 717 if (!n) PetscFunctionReturn(0); 718 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 719 try { 720 cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 721 cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 722 if (!upTriFactor && !loTriFactor) { 723 /* Allocate Space for the upper triangular matrix */ 724 cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 725 cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 726 727 /* Fill the upper triangular matrix */ 728 AiUp[0]=(PetscInt) 0; 729 AiUp[n]=nzUpper; 730 offset = 0; 731 for (i=0; i<n; i++) { 732 /* set the pointers */ 733 v = aa + ai[i]; 734 vj = aj + ai[i]; 735 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 736 737 /* first, set the diagonal elements */ 738 AjUp[offset] = (PetscInt) i; 739 AAUp[offset] = (MatScalar)1.0/v[nz]; 740 AiUp[i] = offset; 741 AALo[offset] = (MatScalar)1.0/v[nz]; 742 743 offset+=1; 744 if (nz>0) { 745 ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr); 746 ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 747 for (j=offset; j<offset+nz; j++) { 748 AAUp[j] = -AAUp[j]; 749 AALo[j] = AAUp[j]/v[nz]; 750 } 751 offset+=nz; 752 } 753 } 754 755 /* allocate space for the triangular factor information */ 756 ierr = PetscNew(&upTriFactor);CHKERRQ(ierr); 757 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 758 759 /* Create the matrix description */ 760 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 761 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 762 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 763 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 764 #else 765 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 766 #endif 767 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 768 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 769 770 /* set the matrix */ 771 upTriFactor->csrMat = new CsrMatrix; 772 upTriFactor->csrMat->num_rows = A->rmap->n; 773 upTriFactor->csrMat->num_cols = A->cmap->n; 774 upTriFactor->csrMat->num_entries = a->nz; 775 776 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 777 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 778 779 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 780 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 781 782 upTriFactor->csrMat->values = new 
THRUSTARRAY(a->nz); 783 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 784 785 /* set the operation */ 786 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 787 788 /* Create the solve analysis information */ 789 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 790 stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 791 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 792 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 793 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 794 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 795 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 796 &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 797 cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 798 #endif 799 800 /* perform the solve analysis */ 801 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 802 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 803 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 804 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo 805 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 806 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 807 #endif 808 );CHKERRCUSPARSE(stat); 809 cerr = WaitForCUDA();CHKERRCUDA(cerr); 810 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 811 812 /* assign the pointer */ 813 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 814 815 /* allocate space for the triangular factor information */ 816 ierr = PetscNew(&loTriFactor);CHKERRQ(ierr); 817 loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 818 819 /* Create the matrix description */ 820 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 821 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 822 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 823 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 824 #else 825 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 826 #endif 827 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 828 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 829 830 /* set the operation */ 831 loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 832 833 /* set the matrix */ 834 loTriFactor->csrMat = new CsrMatrix; 835 loTriFactor->csrMat->num_rows = A->rmap->n; 836 loTriFactor->csrMat->num_cols = A->cmap->n; 837 loTriFactor->csrMat->num_entries = a->nz; 838 839 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 840 loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 841 842 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 843 loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 844 845 loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 846 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 847 848 /* Create the solve analysis information */ 849 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 850 stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 851 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 852 
stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 853 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 854 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 855 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 856 &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 857 cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 858 #endif 859 860 /* perform the solve analysis */ 861 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 862 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 863 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 864 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo 865 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 866 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 867 #endif 868 );CHKERRCUSPARSE(stat); 869 cerr = WaitForCUDA();CHKERRCUDA(cerr); 870 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 871 872 /* assign the pointer */ 873 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 874 875 ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr); 876 cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 877 cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 878 } else { 879 /* Fill the upper triangular matrix */ 880 offset = 0; 881 for (i=0; i<n; i++) { 882 /* set the pointers */ 883 v = aa + ai[i]; 884 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 885 886 /* first, set the diagonal elements */ 887 AAUp[offset] = 1.0/v[nz]; 888 AALo[offset] = 1.0/v[nz]; 889 890 offset+=1; 891 if (nz>0) { 892 ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 893 for (j=offset; j<offset+nz; j++) { 894 AAUp[j] = -AAUp[j]; 895 AALo[j] = AAUp[j]/v[nz]; 896 } 897 offset+=nz; 898 } 899 } 900 if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 901 if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 902 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 903 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 904 ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 905 } 906 cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr); 907 cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr); 908 } catch(char *ex) { 909 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 910 } 911 } 912 PetscFunctionReturn(0); 913 } 914 915 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 916 { 917 PetscErrorCode ierr; 918 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 919 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 920 IS ip = a->row; 921 PetscBool perm_identity; 922 PetscInt n = A->rmap->n; 923 924 PetscFunctionBegin; 925 if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 926 ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr); 927 if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 928 cusparseTriFactors->nnz=(a->nz-n)*2 + n; 929 930 A->offloadmask = PETSC_OFFLOAD_BOTH; 931 932 /* lower triangular indices */ 933 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 934 if (!perm_identity) { 935 IS iip; 936 const PetscInt *irip,*rip; 937 938 ierr = 
ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr); 939 ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr); 940 ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); 941 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 942 cusparseTriFactors->rpermIndices->assign(rip, rip+n); 943 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 944 cusparseTriFactors->cpermIndices->assign(irip, irip+n); 945 ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr); 946 ierr = ISDestroy(&iip);CHKERRQ(ierr); 947 ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 948 ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 949 } 950 PetscFunctionReturn(0); 951 } 952 953 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 954 { 955 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 956 IS ip = b->row; 957 PetscBool perm_identity; 958 PetscErrorCode ierr; 959 960 PetscFunctionBegin; 961 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 962 ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 963 B->offloadmask = PETSC_OFFLOAD_CPU; 964 /* determine which version of MatSolve needs to be used. */ 965 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 966 if (perm_identity) { 967 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 968 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 969 B->ops->matsolve = NULL; 970 B->ops->matsolvetranspose = NULL; 971 } else { 972 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 973 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 974 B->ops->matsolve = NULL; 975 B->ops->matsolvetranspose = NULL; 976 } 977 978 /* get the triangular factors */ 979 ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 980 PetscFunctionReturn(0); 981 } 982 983 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 984 { 985 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 986 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 987 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 988 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; 989 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; 990 cusparseStatus_t stat; 991 cusparseIndexBase_t indexBase; 992 cusparseMatrixType_t matrixType; 993 cusparseFillMode_t fillMode; 994 cusparseDiagType_t diagType; 995 cudaError_t cerr; 996 PetscErrorCode ierr; 997 998 PetscFunctionBegin; 999 /* allocate space for the transpose of the lower triangular factor */ 1000 ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr); 1001 loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1002 1003 /* set the matrix descriptors of the lower triangular factor */ 1004 matrixType = cusparseGetMatType(loTriFactor->descr); 1005 indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 1006 fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 
                                            CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                       loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                       loTriFactor->csrMat->values->data().get(),
                                       loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(),
                                       loTriFactorT->csrMat->values->data().get(),
                                       loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
#else
                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
#endif
                         );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle,
loTriFactorT->solveOp, 1067 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1068 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1069 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, 1070 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1071 cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1072 #endif 1073 1074 /* perform the solve analysis */ 1075 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 1076 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1077 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1078 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo 1079 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1080 ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1081 #endif 1082 );CHKERRCUSPARSE(stat); 1083 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1084 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1085 1086 /* assign the pointer */ 1087 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 1088 1089 /*********************************************/ 1090 /* Now the Transpose of the Upper Tri Factor */ 1091 /*********************************************/ 1092 1093 /* allocate space for the transpose of the upper triangular factor */ 1094 ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr); 1095 upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1096 1097 /* set the matrix descriptors of the upper triangular factor */ 1098 matrixType = cusparseGetMatType(upTriFactor->descr); 1099 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 1100 fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1101 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1102 diagType = cusparseGetMatDiagType(upTriFactor->descr); 1103 1104 /* Create the matrix description */ 1105 stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat); 1106 stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 1107 stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 1108 stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 1109 stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1110 1111 /* set the operation */ 1112 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1113 1114 /* allocate GPU space for the CSC of the upper triangular factor*/ 1115 upTriFactorT->csrMat = new CsrMatrix; 1116 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1117 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1118 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1119 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1); 1120 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1121 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1122 1123 /* compute the transpose of the upper triangular factor, i.e. 
 the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                       upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                       upTriFactor->csrMat->values->data().get(),
                                       upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(),
                                       upTriFactorT->csrMat->values->data().get(),
                                       upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
#else
                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
#endif
                         );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                           upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
#endif
                          );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}

struct PetscScalarToPetscInt
{
  __host__ __device__
  PetscInt
operator()(PetscScalar s) 1189 { 1190 return (PetscInt)PetscRealPart(s); 1191 } 1192 }; 1193 1194 static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTransposeForMult(Mat A) 1195 { 1196 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1197 Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT; 1198 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1199 cusparseStatus_t stat; 1200 cusparseIndexBase_t indexBase; 1201 cudaError_t err; 1202 PetscErrorCode ierr; 1203 1204 PetscFunctionBegin; 1205 if (!A->form_explicit_transpose || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0); 1206 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1207 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1208 if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct"); 1209 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1210 if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct"); 1211 if (A->transupdated) PetscFunctionReturn(0); 1212 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1213 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1214 if (cusparsestruct->format != MAT_CUSPARSE_CSR) { 1215 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 1216 } 1217 if (!cusparsestruct->matTranspose) { /* create cusparse matrix */ 1218 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 1219 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat); 1220 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1221 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat); 1222 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1223 1224 /* set alpha and beta */ 1225 err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 1226 err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1227 err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1228 err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1229 err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1230 err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1231 1232 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1233 CsrMatrix *matrixT = new CsrMatrix; 1234 matstructT->mat = matrixT; 1235 matrixT->num_rows = A->cmap->n; 1236 matrixT->num_cols = A->rmap->n; 1237 matrixT->num_entries = a->nz; 1238 matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 1239 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1240 matrixT->values = new THRUSTARRAY(a->nz); 1241 1242 if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); } 1243 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 1244 1245 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1246 stat = cusparseCreateCsr(&matstructT->matDescr, 1247 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1248 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1249 matrixT->values->data().get(), 1250 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1251 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 1252 #endif 1253 } else if 
(cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 1254 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1255 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1256 #else 1257 CsrMatrix *temp = new CsrMatrix; 1258 CsrMatrix *tempT = new CsrMatrix; 1259 /* First convert HYB to CSR */ 1260 temp->num_rows = A->rmap->n; 1261 temp->num_cols = A->cmap->n; 1262 temp->num_entries = a->nz; 1263 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1264 temp->column_indices = new THRUSTINTARRAY32(a->nz); 1265 temp->values = new THRUSTARRAY(a->nz); 1266 1267 stat = cusparse_hyb2csr(cusparsestruct->handle, 1268 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 1269 temp->values->data().get(), 1270 temp->row_offsets->data().get(), 1271 temp->column_indices->data().get());CHKERRCUSPARSE(stat); 1272 1273 /* Next, convert CSR to CSC (i.e. the matrix transpose) */ 1274 tempT->num_rows = A->rmap->n; 1275 tempT->num_cols = A->cmap->n; 1276 tempT->num_entries = a->nz; 1277 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1278 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1279 tempT->values = new THRUSTARRAY(a->nz); 1280 1281 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1282 temp->num_cols, temp->num_entries, 1283 temp->values->data().get(), 1284 temp->row_offsets->data().get(), 1285 temp->column_indices->data().get(), 1286 tempT->values->data().get(), 1287 tempT->column_indices->data().get(), 1288 tempT->row_offsets->data().get(), 1289 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1290 1291 /* Last, convert CSC to HYB */ 1292 cusparseHybMat_t hybMat; 1293 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1294 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 
1295 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1296 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1297 matstructT->descr, tempT->values->data().get(), 1298 tempT->row_offsets->data().get(), 1299 tempT->column_indices->data().get(), 1300 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1301 1302 /* assign the pointer */ 1303 matstructT->mat = hybMat; 1304 A->transupdated = PETSC_TRUE; 1305 /* delete temporaries */ 1306 if (tempT) { 1307 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1308 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1309 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1310 delete (CsrMatrix*) tempT; 1311 } 1312 if (temp) { 1313 if (temp->values) delete (THRUSTARRAY*) temp->values; 1314 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1315 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1316 delete (CsrMatrix*) temp; 1317 } 1318 #endif 1319 } 1320 } 1321 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */ 1322 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1323 CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat; 1324 if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix"); 1325 if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows"); 1326 if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols"); 1327 if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values"); 1328 if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT"); 1329 if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows"); 1330 if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols"); 1331 if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values"); 1332 if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */ 1333 cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1334 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 1335 ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 1336 } 1337 if (!cusparsestruct->csr2csc_i) { 1338 THRUSTARRAY csr2csc_a(matrix->num_entries); 1339 PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0)); 1340 1341 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1342 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1343 void *csr2cscBuffer; 1344 size_t csr2cscBufferSize; 1345 stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, 1346 A->cmap->n, matrix->num_entries, 1347 matrix->values->data().get(), 1348 cusparsestruct->rowoffsets_gpu->data().get(), 1349 matrix->column_indices->data().get(), 1350 matrixT->values->data().get(), 1351 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1352 CUSPARSE_ACTION_NUMERIC,indexBase, 1353 cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat); 1354 err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err); 1355 #endif 1356 1357 if (matrix->num_entries) { 1358 /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in 1359 mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK. 
1360 I checked all the parameters and they were just fine. I have no clue why cusparse complains. 1361 1362 Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[] 1363 should be filled with indexBase. So I just take a shortcut here. 1364 */ 1365 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1366 A->cmap->n,matrix->num_entries, 1367 csr2csc_a.data().get(), 1368 cusparsestruct->rowoffsets_gpu->data().get(), 1369 matrix->column_indices->data().get(), 1370 matrixT->values->data().get(), 1371 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1372 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1373 CUSPARSE_ACTION_NUMERIC,indexBase, 1374 cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat); 1375 #else 1376 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1377 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1378 #endif 1379 } else { 1380 matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 1381 } 1382 1383 cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); 1384 PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt())); 1385 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1386 err = cudaFree(csr2cscBuffer);CHKERRCUDA(err); 1387 #endif 1388 } 1389 PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), 1390 thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), 1391 matrixT->values->begin())); 1392 } 1393 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1394 ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1395 /* the compressed row indices are not used for matTranspose */ 1396 matstructT->cprowIndices = NULL; 1397 /* assign the pointer */ 1398 ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 1399 A->transupdated = PETSC_TRUE; 1400 PetscFunctionReturn(0); 1401 } 1402 1403 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 1404 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1405 { 1406 PetscInt n = xx->map->n; 1407 const PetscScalar *barray; 1408 PetscScalar *xarray; 1409 thrust::device_ptr<const PetscScalar> bGPU; 1410 thrust::device_ptr<PetscScalar> xGPU; 1411 cusparseStatus_t stat; 1412 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1413 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1414 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1415 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1416 PetscErrorCode ierr; 1417 1418 PetscFunctionBegin; 1419 /* Analyze the matrix and create the transpose ...
on the fly */ 1420 if (!loTriFactorT && !upTriFactorT) { 1421 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1422 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1423 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1424 } 1425 1426 /* Get the GPU pointers */ 1427 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1428 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1429 xGPU = thrust::device_pointer_cast(xarray); 1430 bGPU = thrust::device_pointer_cast(barray); 1431 1432 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1433 /* First, reorder with the row permutation */ 1434 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1435 thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 1436 xGPU); 1437 1438 /* First, solve U */ 1439 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1440 upTriFactorT->csrMat->num_rows, 1441 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1442 upTriFactorT->csrMat->num_entries, 1443 #endif 1444 &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1445 upTriFactorT->csrMat->values->data().get(), 1446 upTriFactorT->csrMat->row_offsets->data().get(), 1447 upTriFactorT->csrMat->column_indices->data().get(), 1448 upTriFactorT->solveInfo, 1449 xarray, tempGPU->data().get() 1450 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1451 ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1452 #endif 1453 );CHKERRCUSPARSE(stat); 1454 1455 /* Then, solve L */ 1456 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1457 loTriFactorT->csrMat->num_rows, 1458 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1459 loTriFactorT->csrMat->num_entries, 1460 #endif 1461 &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1462 loTriFactorT->csrMat->values->data().get(), 1463 loTriFactorT->csrMat->row_offsets->data().get(), 1464 loTriFactorT->csrMat->column_indices->data().get(), 1465 loTriFactorT->solveInfo, 1466 tempGPU->data().get(), xarray 1467 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1468 ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1469 #endif 1470 );CHKERRCUSPARSE(stat); 1471 1472 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1473 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1474 thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1475 tempGPU->begin()); 1476 1477 /* Copy the temporary to the full solution. 
*/ 1478 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU); 1479 1480 /* restore */ 1481 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1482 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1483 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1484 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1485 PetscFunctionReturn(0); 1486 } 1487 1488 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1489 { 1490 const PetscScalar *barray; 1491 PetscScalar *xarray; 1492 cusparseStatus_t stat; 1493 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1494 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1495 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1496 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1497 PetscErrorCode ierr; 1498 1499 PetscFunctionBegin; 1500 /* Analyze the matrix and create the transpose ... on the fly */ 1501 if (!loTriFactorT && !upTriFactorT) { 1502 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1503 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1504 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1505 } 1506 1507 /* Get the GPU pointers */ 1508 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1509 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1510 1511 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1512 /* First, solve U */ 1513 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1514 upTriFactorT->csrMat->num_rows, 1515 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1516 upTriFactorT->csrMat->num_entries, 1517 #endif 1518 &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1519 upTriFactorT->csrMat->values->data().get(), 1520 upTriFactorT->csrMat->row_offsets->data().get(), 1521 upTriFactorT->csrMat->column_indices->data().get(), 1522 upTriFactorT->solveInfo, 1523 barray, tempGPU->data().get() 1524 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1525 ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1526 #endif 1527 );CHKERRCUSPARSE(stat); 1528 1529 /* Then, solve L */ 1530 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1531 loTriFactorT->csrMat->num_rows, 1532 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1533 loTriFactorT->csrMat->num_entries, 1534 #endif 1535 &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1536 loTriFactorT->csrMat->values->data().get(), 1537 loTriFactorT->csrMat->row_offsets->data().get(), 1538 loTriFactorT->csrMat->column_indices->data().get(), 1539 loTriFactorT->solveInfo, 1540 tempGPU->data().get(), xarray 1541 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1542 ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1543 #endif 1544 );CHKERRCUSPARSE(stat); 1545 1546 /* restore */ 1547 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1548 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1549 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1550 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1551 PetscFunctionReturn(0); 1552 } 1553 1554 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1555 { 1556 const PetscScalar *barray; 1557 PetscScalar *xarray; 1558 thrust::device_ptr<const PetscScalar> bGPU; 1559 
thrust::device_ptr<PetscScalar> xGPU; 1560 cusparseStatus_t stat; 1561 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1562 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1563 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1564 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1565 PetscErrorCode ierr; 1566 1567 PetscFunctionBegin; 1568 1569 /* Get the GPU pointers */ 1570 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1571 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1572 xGPU = thrust::device_pointer_cast(xarray); 1573 bGPU = thrust::device_pointer_cast(barray); 1574 1575 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1576 /* First, reorder with the row permutation */ 1577 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1578 thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 1579 tempGPU->begin()); 1580 1581 /* Next, solve L */ 1582 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1583 loTriFactor->csrMat->num_rows, 1584 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1585 loTriFactor->csrMat->num_entries, 1586 #endif 1587 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1588 loTriFactor->csrMat->values->data().get(), 1589 loTriFactor->csrMat->row_offsets->data().get(), 1590 loTriFactor->csrMat->column_indices->data().get(), 1591 loTriFactor->solveInfo, 1592 tempGPU->data().get(), xarray 1593 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1594 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 1595 #endif 1596 );CHKERRCUSPARSE(stat); 1597 1598 /* Then, solve U */ 1599 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1600 upTriFactor->csrMat->num_rows, 1601 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1602 upTriFactor->csrMat->num_entries, 1603 #endif 1604 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1605 upTriFactor->csrMat->values->data().get(), 1606 upTriFactor->csrMat->row_offsets->data().get(), 1607 upTriFactor->csrMat->column_indices->data().get(), 1608 upTriFactor->solveInfo, 1609 xarray, tempGPU->data().get() 1610 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1611 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 1612 #endif 1613 );CHKERRCUSPARSE(stat); 1614 1615 /* Last, reorder with the column permutation */ 1616 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 1617 thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), 1618 xGPU); 1619 1620 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1621 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1622 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1623 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1624 PetscFunctionReturn(0); 1625 } 1626 1627 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1628 { 1629 const PetscScalar *barray; 1630 PetscScalar *xarray; 1631 cusparseStatus_t stat; 1632 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1633 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1634 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = 
(Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1635 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1636 PetscErrorCode ierr; 1637 1638 PetscFunctionBegin; 1639 /* Get the GPU pointers */ 1640 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1641 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1642 1643 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1644 /* First, solve L */ 1645 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1646 loTriFactor->csrMat->num_rows, 1647 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1648 loTriFactor->csrMat->num_entries, 1649 #endif 1650 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1651 loTriFactor->csrMat->values->data().get(), 1652 loTriFactor->csrMat->row_offsets->data().get(), 1653 loTriFactor->csrMat->column_indices->data().get(), 1654 loTriFactor->solveInfo, 1655 barray, tempGPU->data().get() 1656 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1657 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 1658 #endif 1659 );CHKERRCUSPARSE(stat); 1660 1661 /* Next, solve U */ 1662 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1663 upTriFactor->csrMat->num_rows, 1664 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1665 upTriFactor->csrMat->num_entries, 1666 #endif 1667 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1668 upTriFactor->csrMat->values->data().get(), 1669 upTriFactor->csrMat->row_offsets->data().get(), 1670 upTriFactor->csrMat->column_indices->data().get(), 1671 upTriFactor->solveInfo, 1672 tempGPU->data().get(), xarray 1673 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1674 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 1675 #endif 1676 );CHKERRCUSPARSE(stat); 1677 1678 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1679 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1680 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1681 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1682 PetscFunctionReturn(0); 1683 } 1684 1685 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) 1686 { 1687 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1688 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1689 cudaError_t cerr; 1690 PetscErrorCode ierr; 1691 1692 PetscFunctionBegin; 1693 if (A->offloadmask == PETSC_OFFLOAD_GPU) { 1694 CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat; 1695 1696 ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 1697 cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 1698 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1699 ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr); 1700 ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 1701 A->offloadmask = PETSC_OFFLOAD_BOTH; 1702 } 1703 PetscFunctionReturn(0); 1704 } 1705 1706 static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 1707 { 1708 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1709 PetscErrorCode ierr; 1710 1711 PetscFunctionBegin; 1712 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 1713 *array = a->a; 1714 A->offloadmask = PETSC_OFFLOAD_CPU; 1715 PetscFunctionReturn(0); 1716 } 1717 1718 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 1719 { 1720 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1721 Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 1722 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1723 PetscInt m = A->rmap->n,*ii,*ridx,tmp; 1724 PetscErrorCode ierr; 1725 cusparseStatus_t stat; 
1726 PetscBool both = PETSC_TRUE; 1727 cudaError_t err; 1728 1729 PetscFunctionBegin; 1730 if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU"); 1731 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 1732 if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */ 1733 CsrMatrix *matrix; 1734 matrix = (CsrMatrix*)cusparsestruct->mat->mat; 1735 1736 if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values"); 1737 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1738 matrix->values->assign(a->a, a->a+a->nz); 1739 err = WaitForCUDA();CHKERRCUDA(err); 1740 ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 1741 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1742 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); 1743 } else { 1744 PetscInt nnz; 1745 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1746 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr); 1747 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 1748 delete cusparsestruct->workVector; 1749 delete cusparsestruct->rowoffsets_gpu; 1750 cusparsestruct->workVector = NULL; 1751 cusparsestruct->rowoffsets_gpu = NULL; 1752 try { 1753 if (a->compressedrow.use) { 1754 m = a->compressedrow.nrows; 1755 ii = a->compressedrow.i; 1756 ridx = a->compressedrow.rindex; 1757 } else { 1758 m = A->rmap->n; 1759 ii = a->i; 1760 ridx = NULL; 1761 } 1762 if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data"); 1763 if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data"); 1764 if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; } 1765 else nnz = a->nz; 1766 1767 /* create cusparse matrix */ 1768 cusparsestruct->nrows = m; 1769 matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 1770 stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat); 1771 stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 1772 stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1773 1774 err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 1775 err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1776 err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1777 err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1778 err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1779 err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1780 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 1781 1782 /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1783 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1784 /* set the matrix */ 1785 CsrMatrix *mat= new CsrMatrix; 1786 mat->num_rows = m; 1787 mat->num_cols = A->cmap->n; 1788 mat->num_entries = nnz; 1789 mat->row_offsets = new THRUSTINTARRAY32(m+1); 1790 mat->row_offsets->assign(ii, ii + m+1); 1791 1792 mat->column_indices = new THRUSTINTARRAY32(nnz); 1793 mat->column_indices->assign(a->j, a->j+nnz); 1794 1795 
mat->values = new THRUSTARRAY(nnz); 1796 if (a->a) mat->values->assign(a->a, a->a+nnz); 1797 1798 /* assign the pointer */ 1799 matstruct->mat = mat; 1800 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1801 if (mat->num_rows) { /* cusparse errors on empty matrices! */ 1802 stat = cusparseCreateCsr(&matstruct->matDescr, 1803 mat->num_rows, mat->num_cols, mat->num_entries, 1804 mat->row_offsets->data().get(), mat->column_indices->data().get(), 1805 mat->values->data().get(), 1806 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 1807 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 1808 } 1809 #endif 1810 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1811 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1812 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1813 #else 1814 CsrMatrix *mat= new CsrMatrix; 1815 mat->num_rows = m; 1816 mat->num_cols = A->cmap->n; 1817 mat->num_entries = nnz; 1818 mat->row_offsets = new THRUSTINTARRAY32(m+1); 1819 mat->row_offsets->assign(ii, ii + m+1); 1820 1821 mat->column_indices = new THRUSTINTARRAY32(nnz); 1822 mat->column_indices->assign(a->j, a->j+nnz); 1823 1824 mat->values = new THRUSTARRAY(nnz); 1825 if (a->a) mat->values->assign(a->a, a->a+nnz); 1826 1827 cusparseHybMat_t hybMat; 1828 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1829 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1830 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1831 stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, 1832 matstruct->descr, mat->values->data().get(), 1833 mat->row_offsets->data().get(), 1834 mat->column_indices->data().get(), 1835 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1836 /* assign the pointer */ 1837 matstruct->mat = hybMat; 1838 1839 if (mat) { 1840 if (mat->values) delete (THRUSTARRAY*)mat->values; 1841 if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices; 1842 if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets; 1843 delete (CsrMatrix*)mat; 1844 } 1845 #endif 1846 } 1847 1848 /* assign the compressed row indices */ 1849 if (a->compressedrow.use) { 1850 cusparsestruct->workVector = new THRUSTARRAY(m); 1851 matstruct->cprowIndices = new THRUSTINTARRAY(m); 1852 matstruct->cprowIndices->assign(ridx,ridx+m); 1853 tmp = m; 1854 } else { 1855 cusparsestruct->workVector = NULL; 1856 matstruct->cprowIndices = NULL; 1857 tmp = 0; 1858 } 1859 ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr); 1860 1861 /* assign the pointer */ 1862 cusparsestruct->mat = matstruct; 1863 } catch(char *ex) { 1864 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1865 } 1866 err = WaitForCUDA();CHKERRCUDA(err); 1867 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1868 cusparsestruct->nonzerostate = A->nonzerostate; 1869 } 1870 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 1871 } 1872 PetscFunctionReturn(0); 1873 } 1874 1875 struct VecCUDAPlusEquals 1876 { 1877 template <typename Tuple> 1878 __host__ __device__ 1879 void operator()(Tuple t) 1880 { 1881 thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 1882 } 1883 }; 1884 1885 struct VecCUDAEquals 1886 { 1887 template <typename Tuple> 1888 __host__ __device__ 1889 void operator()(Tuple t) 1890 { 1891 thrust::get<1>(t) = thrust::get<0>(t); 1892 } 1893 }; 
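/* The small functors above (PetscScalarToPetscInt, VecCUDAPlusEquals, VecCUDAEquals) and VecCUDAEqualsReverse below
   are plain Thrust function objects applied element-wise through thrust::for_each/thrust::transform over zip and
   permutation iterators. A minimal usage sketch (the names work, cprow and zptr are illustrative, not the exact
   iterators or stream policy used by the kernels in this file): given a thrust::device_ptr<PetscScalar> zptr for the
   output vector, a THRUSTARRAY *work holding a compressed-row SpMV result and a THRUSTINTARRAY *cprow with the
   compressed row indices,

     thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(work->begin(),
                        thrust::make_permutation_iterator(zptr, cprow->begin()))),
                      thrust::make_zip_iterator(thrust::make_tuple(work->end(),
                        thrust::make_permutation_iterator(zptr, cprow->end()))),
                      VecCUDAPlusEquals());

   scatter-adds the compressed entries back into the full vector: each tuple pairs a work entry (get<0>) with the
   corresponding permuted output entry (get<1>), and the functor accumulates the former into the latter. */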
1894 1895 struct VecCUDAEqualsReverse 1896 { 1897 template <typename Tuple> 1898 __host__ __device__ 1899 void operator()(Tuple t) 1900 { 1901 thrust::get<0>(t) = thrust::get<1>(t); 1902 } 1903 }; 1904 1905 struct MatMatCusparse { 1906 PetscBool cisdense; 1907 PetscScalar *Bt; 1908 Mat X; 1909 PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */ 1910 PetscLogDouble flops; 1911 CsrMatrix *Bcsr; 1912 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1913 cusparseSpMatDescr_t matSpBDescr; 1914 PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 1915 cusparseDnMatDescr_t matBDescr; 1916 cusparseDnMatDescr_t matCDescr; 1917 PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/ 1918 size_t mmBufferSize; 1919 void *mmBuffer; 1920 void *mmBuffer2; /* SpGEMM WorkEstimation buffer */ 1921 cusparseSpGEMMDescr_t spgemmDesc; 1922 #endif 1923 }; 1924 1925 static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 1926 { 1927 PetscErrorCode ierr; 1928 MatMatCusparse *mmdata = (MatMatCusparse *)data; 1929 cudaError_t cerr; 1930 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1931 cusparseStatus_t stat; 1932 #endif 1933 1934 PetscFunctionBegin; 1935 cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr); 1936 delete mmdata->Bcsr; 1937 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1938 if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); } 1939 if (mmdata->mmBuffer) { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); } 1940 if (mmdata->mmBuffer2) { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); } 1941 if (mmdata->matBDescr) { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); } 1942 if (mmdata->matCDescr) { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); } 1943 if (mmdata->spgemmDesc) { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); } 1944 #endif 1945 ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr); 1946 ierr = PetscFree(data);CHKERRQ(ierr); 1947 PetscFunctionReturn(0); 1948 } 1949 1950 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool); 1951 1952 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 1953 { 1954 Mat_Product *product = C->product; 1955 Mat A,B; 1956 PetscInt m,n,blda,clda; 1957 PetscBool flg,biscuda; 1958 Mat_SeqAIJCUSPARSE *cusp; 1959 cusparseStatus_t stat; 1960 cusparseOperation_t opA; 1961 const PetscScalar *barray; 1962 PetscScalar *carray; 1963 PetscErrorCode ierr; 1964 MatMatCusparse *mmdata; 1965 Mat_SeqAIJCUSPARSEMultStruct *mat; 1966 CsrMatrix *csrmat; 1967 1968 PetscFunctionBegin; 1969 MatCheckProduct(C,1); 1970 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty"); 1971 mmdata = (MatMatCusparse*)product->data; 1972 A = product->A; 1973 B = product->B; 1974 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 1975 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 1976 /* currently CopyToGpu does not copy if the matrix is bound to CPU 1977 Instead of silently accepting the wrong answer, I prefer to raise the error */ 1978 if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 1979 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1980 cusp = 
(Mat_SeqAIJCUSPARSE*)A->spptr; 1981 switch (product->type) { 1982 case MATPRODUCT_AB: 1983 case MATPRODUCT_PtAP: 1984 mat = cusp->mat; 1985 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1986 m = A->rmap->n; 1987 n = B->cmap->n; 1988 break; 1989 case MATPRODUCT_AtB: 1990 if (!A->form_explicit_transpose) { 1991 mat = cusp->mat; 1992 opA = CUSPARSE_OPERATION_TRANSPOSE; 1993 } else { 1994 ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr); 1995 mat = cusp->matTranspose; 1996 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1997 } 1998 m = A->cmap->n; 1999 n = B->cmap->n; 2000 break; 2001 case MATPRODUCT_ABt: 2002 case MATPRODUCT_RARt: 2003 mat = cusp->mat; 2004 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2005 m = A->rmap->n; 2006 n = B->rmap->n; 2007 break; 2008 default: 2009 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2010 } 2011 if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 2012 csrmat = (CsrMatrix*)mat->mat; 2013 /* if the user passed a CPU matrix, copy the data to the GPU */ 2014 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr); 2015 if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);} 2016 ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr); 2017 2018 ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr); 2019 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2020 ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2021 ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr); 2022 } else { 2023 ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr); 2024 ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr); 2025 } 2026 2027 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2028 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2029 cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? 
CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 2030 /* (re)allocate mmBuffer if not initialized or LDAs are different */ 2031 if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 2032 size_t mmBufferSize; 2033 if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;} 2034 if (!mmdata->matBDescr) { 2035 stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2036 mmdata->Blda = blda; 2037 } 2038 2039 if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;} 2040 if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 2041 stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2042 mmdata->Clda = clda; 2043 } 2044 2045 if (!mat->matDescr) { 2046 stat = cusparseCreateCsr(&mat->matDescr, 2047 csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, 2048 csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), 2049 csrmat->values->data().get(), 2050 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2051 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 2052 } 2053 stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one, 2054 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2055 mmdata->matCDescr,cusparse_scalartype, 2056 cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat); 2057 if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) { 2058 cudaError_t cerr; 2059 cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); 2060 cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr); 2061 mmdata->mmBufferSize = mmBufferSize; 2062 } 2063 mmdata->initialized = PETSC_TRUE; 2064 } else { 2065 /* to be safe, always update pointers of the mats */ 2066 stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat); 2067 stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat); 2068 stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat); 2069 } 2070 2071 /* do cusparseSpMM, which supports transpose on B */ 2072 stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one, 2073 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2074 mmdata->matCDescr,cusparse_scalartype, 2075 cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2076 #else 2077 PetscInt k; 2078 /* cusparseXcsrmm does not support transpose on B */ 2079 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2080 cublasHandle_t cublasv2handle; 2081 cublasStatus_t cerr; 2082 2083 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 2084 cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T, 2085 B->cmap->n,B->rmap->n, 2086 &PETSC_CUSPARSE_ONE ,barray,blda, 2087 &PETSC_CUSPARSE_ZERO,barray,blda, 2088 mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr); 2089 blda = B->cmap->n; 2090 k = B->cmap->n; 2091 } else { 2092 k = B->rmap->n; 2093 } 2094 2095 /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 2096 stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k, 2097 csrmat->num_entries,mat->alpha_one,mat->descr, 2098 csrmat->values->data().get(), 2099 csrmat->row_offsets->data().get(), 2100 csrmat->column_indices->data().get(), 2101
mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero, 2102 carray,clda);CHKERRCUSPARSE(stat); 2103 #endif 2104 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2105 ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr); 2106 ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr); 2107 if (product->type == MATPRODUCT_RARt) { 2108 ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2109 ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2110 } else if (product->type == MATPRODUCT_PtAP) { 2111 ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2112 ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2113 } else { 2114 ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr); 2115 } 2116 if (mmdata->cisdense) { 2117 ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr); 2118 } 2119 if (!biscuda) { 2120 ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 2121 } 2122 PetscFunctionReturn(0); 2123 } 2124 2125 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2126 { 2127 Mat_Product *product = C->product; 2128 Mat A,B; 2129 PetscInt m,n; 2130 PetscBool cisdense,flg; 2131 PetscErrorCode ierr; 2132 MatMatCusparse *mmdata; 2133 Mat_SeqAIJCUSPARSE *cusp; 2134 2135 PetscFunctionBegin; 2136 MatCheckProduct(C,1); 2137 if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty"); 2138 A = product->A; 2139 B = product->B; 2140 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2141 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2142 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2143 if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2144 switch (product->type) { 2145 case MATPRODUCT_AB: 2146 m = A->rmap->n; 2147 n = B->cmap->n; 2148 break; 2149 case MATPRODUCT_AtB: 2150 m = A->cmap->n; 2151 n = B->cmap->n; 2152 break; 2153 case MATPRODUCT_ABt: 2154 m = A->rmap->n; 2155 n = B->rmap->n; 2156 break; 2157 case MATPRODUCT_PtAP: 2158 m = B->cmap->n; 2159 n = B->cmap->n; 2160 break; 2161 case MATPRODUCT_RARt: 2162 m = B->rmap->n; 2163 n = B->rmap->n; 2164 break; 2165 default: 2166 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2167 } 2168 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2169 /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */ 2170 ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr); 2171 ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr); 2172 2173 /* product data */ 2174 ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2175 mmdata->cisdense = cisdense; 2176 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0) 2177 /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */ 2178 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2179 cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr); 2180 } 2181 #endif 2182 /* for these products we need intermediate storage */ 2183 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2184 ierr = 
MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr); 2185 ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr); 2186 if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */ 2187 ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr); 2188 } else { 2189 ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr); 2190 } 2191 } 2192 C->product->data = mmdata; 2193 C->product->destroy = MatDestroy_MatMatCusparse; 2194 2195 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 2196 PetscFunctionReturn(0); 2197 } 2198 2199 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2200 { 2201 Mat_Product *product = C->product; 2202 Mat A,B; 2203 Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2204 Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 2205 Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2206 CsrMatrix *Acsr,*Bcsr,*Ccsr; 2207 PetscBool flg; 2208 PetscErrorCode ierr; 2209 cusparseStatus_t stat; 2210 cudaError_t cerr; 2211 MatProductType ptype; 2212 MatMatCusparse *mmdata; 2213 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2214 cusparseSpMatDescr_t BmatSpDescr; 2215 #endif 2216 2217 PetscFunctionBegin; 2218 MatCheckProduct(C,1); 2219 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty"); 2220 ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2221 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name); 2222 mmdata = (MatMatCusparse*)C->product->data; 2223 A = product->A; 2224 B = product->B; 2225 if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */ 2226 mmdata->reusesym = PETSC_FALSE; 2227 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2228 if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2229 Cmat = Ccusp->mat; 2230 if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]); 2231 Ccsr = (CsrMatrix*)Cmat->mat; 2232 if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct"); 2233 goto finalize; 2234 } 2235 if (!c->nz) goto finalize; 2236 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2237 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2238 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2239 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name); 2240 if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2241 if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2242 Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2243 Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2244 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2245 if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2246 if (Bcusp->format 
!= MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2247 if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2248 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2249 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 2250 2251 ptype = product->type; 2252 if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 2253 if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 2254 switch (ptype) { 2255 case MATPRODUCT_AB: 2256 Amat = Acusp->mat; 2257 Bmat = Bcusp->mat; 2258 break; 2259 case MATPRODUCT_AtB: 2260 Amat = Acusp->matTranspose; 2261 Bmat = Bcusp->mat; 2262 break; 2263 case MATPRODUCT_ABt: 2264 Amat = Acusp->mat; 2265 Bmat = Bcusp->matTranspose; 2266 break; 2267 default: 2268 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2269 } 2270 Cmat = Ccusp->mat; 2271 if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2272 if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2273 if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]); 2274 Acsr = (CsrMatrix*)Amat->mat; 2275 Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */ 2276 Ccsr = (CsrMatrix*)Cmat->mat; 2277 if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct"); 2278 if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct"); 2279 if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct"); 2280 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2281 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2282 BmatSpDescr = mmdata->Bcsr ? 
mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */ 2283 stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2284 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2285 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2286 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2287 stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2288 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2289 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2290 #else 2291 stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2292 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2293 Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2294 Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2295 Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2296 #endif 2297 ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2298 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2299 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2300 C->offloadmask = PETSC_OFFLOAD_GPU; 2301 finalize: 2302 /* shorter version of MatAssemblyEnd_SeqAIJ */ 2303 ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 2304 ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 2305 ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr); 2306 c->reallocs = 0; 2307 C->info.mallocs += 0; 2308 C->info.nz_unneeded = 0; 2309 C->assembled = C->was_assembled = PETSC_TRUE; 2310 C->num_ass++; 2311 PetscFunctionReturn(0); 2312 } 2313 2314 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2315 { 2316 Mat_Product *product = C->product; 2317 Mat A,B; 2318 Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2319 Mat_SeqAIJ *a,*b,*c; 2320 Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2321 CsrMatrix *Acsr,*Bcsr,*Ccsr; 2322 PetscInt i,j,m,n,k; 2323 PetscBool flg; 2324 PetscErrorCode ierr; 2325 cusparseStatus_t stat; 2326 cudaError_t cerr; 2327 MatProductType ptype; 2328 MatMatCusparse *mmdata; 2329 PetscLogDouble flops; 2330 PetscBool biscompressed,ciscompressed; 2331 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2332 int64_t C_num_rows1, C_num_cols1, C_nnz1; 2333 size_t bufSize2; 2334 cusparseSpMatDescr_t BmatSpDescr; 2335 #else 2336 int cnz; 2337 #endif 2338 2339 PetscFunctionBegin; 2340 MatCheckProduct(C,1); 2341 if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty"); 2342 A = product->A; 2343 B = product->B; 2344 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2345 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2346 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2347 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name); 2348 a = (Mat_SeqAIJ*)A->data; 2349 b = (Mat_SeqAIJ*)B->data; 2350 Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2351 Bcusp = 
(Mat_SeqAIJCUSPARSE*)B->spptr; 2352 if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2353 if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2354 2355 /* product data */ 2356 ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2357 C->product->data = mmdata; 2358 C->product->destroy = MatDestroy_MatMatCusparse; 2359 2360 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2361 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 2362 ptype = product->type; 2363 if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 2364 if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 2365 biscompressed = PETSC_FALSE; 2366 ciscompressed = PETSC_FALSE; 2367 switch (ptype) { 2368 case MATPRODUCT_AB: 2369 m = A->rmap->n; 2370 n = B->cmap->n; 2371 k = A->cmap->n; 2372 Amat = Acusp->mat; 2373 Bmat = Bcusp->mat; 2374 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2375 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2376 break; 2377 case MATPRODUCT_AtB: 2378 m = A->cmap->n; 2379 n = B->cmap->n; 2380 k = A->rmap->n; 2381 ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr); 2382 Amat = Acusp->matTranspose; 2383 Bmat = Bcusp->mat; 2384 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2385 break; 2386 case MATPRODUCT_ABt: 2387 m = A->rmap->n; 2388 n = B->rmap->n; 2389 k = A->cmap->n; 2390 ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr); 2391 Amat = Acusp->mat; 2392 Bmat = Bcusp->matTranspose; 2393 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2394 break; 2395 default: 2396 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2397 } 2398 2399 /* create cusparse matrix */ 2400 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2401 ierr = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 2402 c = (Mat_SeqAIJ*)C->data; 2403 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2404 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 2405 Ccsr = new CsrMatrix; 2406 2407 c->compressedrow.use = ciscompressed; 2408 if (c->compressedrow.use) { /* if a is in compressed row, then c will be in compressed row format */ 2409 c->compressedrow.nrows = a->compressedrow.nrows; 2410 ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr); 2411 ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr); 2412 Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 2413 Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 2414 Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows); 2415 } else { 2416 c->compressedrow.nrows = 0; 2417 c->compressedrow.i = NULL; 2418 c->compressedrow.rindex = NULL; 2419 Ccusp->workVector = NULL; 2420 Cmat->cprowIndices = NULL; 2421 } 2422 Ccusp->nrows = ciscompressed ?
c->compressedrow.nrows : m; 2423 Ccusp->mat = Cmat; 2424 Ccusp->mat->mat = Ccsr; 2425 Ccsr->num_rows = Ccusp->nrows; 2426 Ccsr->num_cols = n; 2427 Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1); 2428 stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat); 2429 stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 2430 stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 2431 cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 2432 cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 2433 cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 2434 cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2435 cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2436 cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2437 if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */ 2438 thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0); 2439 c->nz = 0; 2440 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2441 Ccsr->values = new THRUSTARRAY(c->nz); 2442 goto finalizesym; 2443 } 2444 2445 if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2446 if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2447 Acsr = (CsrMatrix*)Amat->mat; 2448 if (!biscompressed) { 2449 Bcsr = (CsrMatrix*)Bmat->mat; 2450 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2451 BmatSpDescr = Bmat->matDescr; 2452 #endif 2453 } else { /* we need to use row offsets for the full matrix */ 2454 CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat; 2455 Bcsr = new CsrMatrix; 2456 Bcsr->num_rows = B->rmap->n; 2457 Bcsr->num_cols = cBcsr->num_cols; 2458 Bcsr->num_entries = cBcsr->num_entries; 2459 Bcsr->column_indices = cBcsr->column_indices; 2460 Bcsr->values = cBcsr->values; 2461 if (!Bcusp->rowoffsets_gpu) { 2462 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 2463 Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 2464 ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 2465 } 2466 Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 2467 mmdata->Bcsr = Bcsr; 2468 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2469 if (Bcsr->num_rows && Bcsr->num_cols) { 2470 stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, 2471 Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2472 Bcsr->values->data().get(), 2473 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2474 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2475 } 2476 BmatSpDescr = mmdata->matSpBDescr; 2477 #endif 2478 } 2479 if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct"); 2480 if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct"); 2481 /* precompute flops count */ 2482 if (ptype == MATPRODUCT_AB) { 2483 for (i=0, flops = 0; i<A->rmap->n; i++) { 2484 const PetscInt st = a->i[i]; 2485 const PetscInt en = a->i[i+1]; 2486 for (j=st; j<en; j++) { 2487 const PetscInt brow = 
a->j[j]; 2488 flops += 2.*(b->i[brow+1] - b->i[brow]); 2489 } 2490 } 2491 } else if (ptype == MATPRODUCT_AtB) { 2492 for (i=0, flops = 0; i<A->rmap->n; i++) { 2493 const PetscInt anzi = a->i[i+1] - a->i[i]; 2494 const PetscInt bnzi = b->i[i+1] - b->i[i]; 2495 flops += (2.*anzi)*bnzi; 2496 } 2497 } else { /* TODO */ 2498 flops = 0.; 2499 } 2500 2501 mmdata->flops = flops; 2502 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2503 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2504 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2505 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, 2506 NULL, NULL, NULL, 2507 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2508 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2509 stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2510 /* ask bufferSize bytes for external memory */ 2511 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2512 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2513 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2514 mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat); 2515 cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr); 2516 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2517 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2518 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2519 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2520 mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat); 2521 /* ask bufferSize again bytes for external memory */ 2522 stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2523 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2524 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2525 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat); 2526 /* The CUSPARSE documentation is not clear, nor the API 2527 We need both buffers to perform the operations properly! 2528 mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 2529 it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 2530 is stored in the descriptor! What a messy API... 
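     For reference, the call sequence used in this path (CUDA >= 11.0 generic SpGEMM API) is:
       1) cusparseSpGEMM_workEstimation() with a NULL buffer, to query bufSize2;
       2) cusparseSpGEMM_workEstimation() again, now passing mmBuffer2, to inspect A and B;
       3) cusparseSpGEMM_compute() with a NULL buffer, to query mmBufferSize;
       4) cusparseSpGEMM_compute() again, now passing mmBuffer, to compute the product;
       5) cusparseSpMatGetSize() to obtain nnz(C), then allocate C's arrays and call cusparseCsrSetPointers();
       6) cusparseSpGEMM_copy() to copy the result into C.
     Both mmBuffer2 and mmBuffer are stashed in mmdata so that they outlive this routine.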
*/ 2531 cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr); 2532 /* compute the intermediate product of A * B */ 2533 stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2534 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2535 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2536 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2537 /* get matrix C non-zero entries C_nnz1 */ 2538 stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat); 2539 c->nz = (PetscInt) C_nnz1; 2540 ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr); 2541 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2542 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2543 Ccsr->values = new THRUSTARRAY(c->nz); 2544 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2545 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2546 Ccsr->values->data().get());CHKERRCUSPARSE(stat); 2547 stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2548 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2549 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2550 #else 2551 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 2552 stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2553 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2554 Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2555 Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2556 Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat); 2557 c->nz = cnz; 2558 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2559 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2560 Ccsr->values = new THRUSTARRAY(c->nz); 2561 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2562 2563 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2564 /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 2565 I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 2566 D is NULL, despite the fact that CUSPARSE documentation claims it is supported! 
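     As a result, in this pre-11.0 branch the full numeric product below (cusparse_csr_spgemm) is
     already carried out here, during the symbolic phase.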
*/ 2567 stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2568 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2569 Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2570 Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2571 Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2572 #endif 2573 ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2574 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2575 finalizesym: 2576 c->singlemalloc = PETSC_FALSE; 2577 c->free_a = PETSC_TRUE; 2578 c->free_ij = PETSC_TRUE; 2579 ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 2580 ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 2581 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 2582 PetscInt *d_i = c->i; 2583 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 2584 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 2585 ii = *Ccsr->row_offsets; 2586 jj = *Ccsr->column_indices; 2587 if (ciscompressed) d_i = c->compressedrow.i; 2588 cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2589 cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2590 } else { 2591 PetscInt *d_i = c->i; 2592 if (ciscompressed) d_i = c->compressedrow.i; 2593 cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2594 cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2595 } 2596 if (ciscompressed) { /* need to expand host row offsets */ 2597 PetscInt r = 0; 2598 c->i[0] = 0; 2599 for (k = 0; k < c->compressedrow.nrows; k++) { 2600 const PetscInt next = c->compressedrow.rindex[k]; 2601 const PetscInt old = c->compressedrow.i[k]; 2602 for (; r < next; r++) c->i[r+1] = old; 2603 } 2604 for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows]; 2605 } 2606 ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 2607 ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 2608 ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 2609 c->maxnz = c->nz; 2610 c->nonzerorowcnt = 0; 2611 c->rmax = 0; 2612 for (k = 0; k < m; k++) { 2613 const PetscInt nn = c->i[k+1] - c->i[k]; 2614 c->ilen[k] = c->imax[k] = nn; 2615 c->nonzerorowcnt += (PetscInt)!!nn; 2616 c->rmax = PetscMax(c->rmax,nn); 2617 } 2618 ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr); 2619 ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 2620 Ccsr->num_entries = c->nz; 2621 2622 C->nonzerostate++; 2623 ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr); 2624 ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr); 2625 Ccusp->nonzerostate = C->nonzerostate; 2626 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 2627 C->preallocated = PETSC_TRUE; 2628 C->assembled = PETSC_FALSE; 2629 C->was_assembled = PETSC_FALSE; 2630 if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */ 2631 mmdata->reusesym = PETSC_TRUE; 2632 C->offloadmask = 
PETSC_OFFLOAD_GPU; 2633 } 2634 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2635 PetscFunctionReturn(0); 2636 } 2637 2638 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 2639 2640 /* handles sparse or dense B */ 2641 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat) 2642 { 2643 Mat_Product *product = mat->product; 2644 PetscErrorCode ierr; 2645 PetscBool isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE; 2646 2647 PetscFunctionBegin; 2648 MatCheckProduct(mat,1); 2649 ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr); 2650 if (!product->A->boundtocpu && !product->B->boundtocpu) { 2651 ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr); 2652 } 2653 if (product->type == MATPRODUCT_ABC) { 2654 Ciscusp = PETSC_FALSE; 2655 if (!product->C->boundtocpu) { 2656 ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr); 2657 } 2658 } 2659 if (isdense) { 2660 switch (product->type) { 2661 case MATPRODUCT_AB: 2662 case MATPRODUCT_AtB: 2663 case MATPRODUCT_ABt: 2664 case MATPRODUCT_PtAP: 2665 case MATPRODUCT_RARt: 2666 if (product->A->boundtocpu) { 2667 ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr); 2668 } else { 2669 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 2670 } 2671 break; 2672 case MATPRODUCT_ABC: 2673 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2674 break; 2675 default: 2676 break; 2677 } 2678 } else if (Biscusp && Ciscusp) { 2679 switch (product->type) { 2680 case MATPRODUCT_AB: 2681 case MATPRODUCT_AtB: 2682 case MATPRODUCT_ABt: 2683 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2684 break; 2685 case MATPRODUCT_PtAP: 2686 case MATPRODUCT_RARt: 2687 case MATPRODUCT_ABC: 2688 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2689 break; 2690 default: 2691 break; 2692 } 2693 } else { /* fallback for AIJ */ 2694 ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 2695 } 2696 PetscFunctionReturn(0); 2697 } 2698 2699 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2700 { 2701 PetscErrorCode ierr; 2702 2703 PetscFunctionBegin; 2704 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2705 PetscFunctionReturn(0); 2706 } 2707 2708 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz) 2709 { 2710 PetscErrorCode ierr; 2711 2712 PetscFunctionBegin; 2713 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2714 PetscFunctionReturn(0); 2715 } 2716 2717 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2718 { 2719 PetscErrorCode ierr; 2720 2721 PetscFunctionBegin; 2722 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 2723 PetscFunctionReturn(0); 2724 } 2725 2726 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2727 { 2728 PetscErrorCode ierr; 2729 2730 PetscFunctionBegin; 2731 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 2732 PetscFunctionReturn(0); 2733 } 2734 2735 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2736 { 2737 PetscErrorCode ierr; 2738 2739 PetscFunctionBegin; 2740 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2741 
PetscFunctionReturn(0); 2742 } 2743 2744 __global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y) 2745 { 2746 int i = blockIdx.x*blockDim.x + threadIdx.x; 2747 if (i < n) y[idx[i]] += x[i]; 2748 } 2749 2750 /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 2751 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm) 2752 { 2753 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2754 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 2755 Mat_SeqAIJCUSPARSEMultStruct *matstruct; 2756 PetscScalar *xarray,*zarray,*dptr,*beta,*xptr; 2757 PetscErrorCode ierr; 2758 cusparseStatus_t stat; 2759 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2760 PetscBool compressed; 2761 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2762 PetscInt nx,ny; 2763 #endif 2764 2765 PetscFunctionBegin; 2766 if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported"); 2767 if (!a->nonzerorowcnt) { 2768 if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);} 2769 else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);} 2770 PetscFunctionReturn(0); 2771 } 2772 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 2773 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2774 if (!trans) { 2775 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 2776 if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 2777 } else { 2778 if (herm || !A->form_explicit_transpose) { 2779 opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 2780 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 2781 } else { 2782 if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);} 2783 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 2784 } 2785 } 2786 /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 2787 compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE; 2788 2789 try { 2790 ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 2791 if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */ 2792 else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */ 2793 2794 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2795 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 2796 /* z = A x + beta y. 2797 If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 2798 When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 2799 */ 2800 xptr = xarray; 2801 dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 2802 beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 2803 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2804 /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 2805 allocated to accommodate different uses. So we get the length info directly from mat. 
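     (For this non-transpose case y = A*x, nx is the number of columns and ny the number of rows of
      the CSR matrix actually stored, which is the compressed one when compressed rows are in use.)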
2806 */ 2807 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2808 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2809 nx = mat->num_cols; 2810 ny = mat->num_rows; 2811 } 2812 #endif 2813 } else { 2814 /* z = A^T x + beta y 2815 If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 2816 Note A^Tx is of full length, so we set beta to 1.0 if y exists. 2817 */ 2818 xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 2819 dptr = zarray; 2820 beta = yy ? matstruct->beta_one : matstruct->beta_zero; 2821 if (compressed) { /* Scatter x to work vector */ 2822 thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 2823 thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 2824 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 2825 VecCUDAEqualsReverse()); 2826 } 2827 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2828 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2829 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2830 nx = mat->num_rows; 2831 ny = mat->num_cols; 2832 } 2833 #endif 2834 } 2835 2836 /* csr_spmv does y = alpha op(A) x + beta y */ 2837 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2838 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2839 if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 2840 if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 2841 cudaError_t cerr; 2842 stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 2843 stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 2844 stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, 2845 matstruct->matDescr, 2846 matstruct->cuSpMV[opA].vecXDescr, beta, 2847 matstruct->cuSpMV[opA].vecYDescr, 2848 cusparse_scalartype, 2849 cusparsestruct->spmvAlg, 2850 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat); 2851 cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr); 2852 2853 matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 2854 } else { 2855 /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 2856 stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat); 2857 stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat); 2858 } 2859 2860 stat = cusparseSpMV(cusparsestruct->handle, opA, 2861 matstruct->alpha_one, 2862 matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTransposeForMult() */ 2863 matstruct->cuSpMV[opA].vecXDescr, 2864 beta, 2865 matstruct->cuSpMV[opA].vecYDescr, 2866 cusparse_scalartype, 2867 cusparsestruct->spmvAlg, 2868 matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat); 2869 #else 2870 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2871 stat = cusparse_csr_spmv(cusparsestruct->handle, opA, 2872 mat->num_rows, mat->num_cols, 2873 mat->num_entries, matstruct->alpha_one, matstruct->descr, 2874 mat->values->data().get(), mat->row_offsets->data().get(), 2875 
mat->column_indices->data().get(), xptr, beta, 2876 dptr);CHKERRCUSPARSE(stat); 2877 #endif 2878 } else { 2879 if (cusparsestruct->nrows) { 2880 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2881 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 2882 #else 2883 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 2884 stat = cusparse_hyb_spmv(cusparsestruct->handle, opA, 2885 matstruct->alpha_one, matstruct->descr, hybMat, 2886 xptr, beta, 2887 dptr);CHKERRCUSPARSE(stat); 2888 #endif 2889 } 2890 } 2891 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2892 2893 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 2894 if (yy) { /* MatMultAdd: zz = A*xx + yy */ 2895 if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 2896 ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */ 2897 } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */ 2898 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 2899 } 2900 } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 2901 ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr); 2902 } 2903 2904 /* ScatterAdd the result from work vector into the full vector when A is compressed */ 2905 if (compressed) { 2906 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2907 /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred) 2908 and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to 2909 prevent that. So I just add a ScatterAdd kernel. 2910 */ 2911 #if 0 2912 thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 2913 thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream), 2914 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 2915 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 2916 VecCUDAPlusEquals()); 2917 #else 2918 PetscInt n = matstruct->cprowIndices->size(); 2919 ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray); 2920 #endif 2921 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2922 } 2923 } else { 2924 if (yy && yy != zz) { 2925 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 2926 } 2927 } 2928 ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 2929 if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);} 2930 else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);} 2931 } catch(char *ex) { 2932 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 2933 } 2934 if (yy) { 2935 ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); 2936 } else { 2937 ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr); 2938 } 2939 PetscFunctionReturn(0); 2940 } 2941 2942 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2943 { 2944 PetscErrorCode ierr; 2945 2946 PetscFunctionBegin; 2947 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2948 PetscFunctionReturn(0); 2949 } 2950 2951 static 
PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode             ierr;
  PetscSplitCSRDataStructure *d_mat = NULL;
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
  }
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it?
  if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0);
  if (d_mat) {
    A->offloadmask = PETSC_OFFLOAD_GPU;
  }

  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage) is fully compatible with standard Fortran 77
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation. For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.
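
   Example usage (an illustrative sketch, not taken from a PETSc example; error checking with
   ierr = .../CHKERRQ() is omitted for brevity, and m, n, nnz, x and y are assumed to be set up
   by the caller):
.vb
      Mat A;
      MatCreateSeqAIJCUSPARSE(PETSC_COMM_SELF,m,n,0,nnz,&A);
      /* ... insert entries with MatSetValues(), then assemble ... */
      MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
      MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
      MatMult(A,x,y);    /* the multiply runs on the GPU through cuSPARSE */
      MatDestroy(&A);
.ve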
3011 3012 Level: intermediate 3013 3014 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 3015 @*/ 3016 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 3017 { 3018 PetscErrorCode ierr; 3019 3020 PetscFunctionBegin; 3021 ierr = MatCreate(comm,A);CHKERRQ(ierr); 3022 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 3023 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3024 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 3025 PetscFunctionReturn(0); 3026 } 3027 3028 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 3029 { 3030 PetscErrorCode ierr; 3031 PetscSplitCSRDataStructure *d_mat = NULL; 3032 3033 PetscFunctionBegin; 3034 if (A->factortype == MAT_FACTOR_NONE) { 3035 d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat; 3036 ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL; 3037 ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr); 3038 } else { 3039 ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr); 3040 } 3041 if (d_mat) { 3042 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3043 cudaError_t err; 3044 PetscSplitCSRDataStructure h_mat; 3045 ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr); 3046 err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err); 3047 if (a->compressedrow.use) { 3048 err = cudaFree(h_mat.diag.i);CHKERRCUDA(err); 3049 } 3050 err = cudaFree(d_mat);CHKERRCUDA(err); 3051 } 3052 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr); 3053 ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr); 3054 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 3055 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 3056 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr); 3057 ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 3058 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 3059 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 3060 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 3061 PetscFunctionReturn(0); 3062 } 3063 3064 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*); 3065 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool); 3066 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B) 3067 { 3068 PetscErrorCode ierr; 3069 3070 PetscFunctionBegin; 3071 ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 3072 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr); 3073 PetscFunctionReturn(0); 3074 } 3075 3076 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str) 3077 { 3078 PetscErrorCode ierr; 3079 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 3080 Mat_SeqAIJCUSPARSE *cy; 3081 Mat_SeqAIJCUSPARSE *cx; 3082 PetscScalar *ay; 3083 const PetscScalar *ax; 3084 CsrMatrix *csry,*csrx; 3085 3086 PetscFunctionBegin; 3087 cy = 
(Mat_SeqAIJCUSPARSE*)Y->spptr; 3088 cx = (Mat_SeqAIJCUSPARSE*)X->spptr; 3089 if (X->ops->axpy != Y->ops->axpy) { 3090 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr); 3091 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 3092 PetscFunctionReturn(0); 3093 } 3094 /* if we are here, it means both matrices are bound to GPU */ 3095 ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr); 3096 ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr); 3097 if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported"); 3098 if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported"); 3099 csry = (CsrMatrix*)cy->mat->mat; 3100 csrx = (CsrMatrix*)cx->mat->mat; 3101 /* see if we can turn this into a cublas axpy */ 3102 if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) { 3103 bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin()); 3104 if (eq) { 3105 eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin()); 3106 } 3107 if (eq) str = SAME_NONZERO_PATTERN; 3108 } 3109 /* spgeam is buggy with one column */ 3110 if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN; 3111 3112 if (str == SUBSET_NONZERO_PATTERN) { 3113 cusparseStatus_t stat; 3114 PetscScalar b = 1.0; 3115 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3116 size_t bufferSize; 3117 void *buffer; 3118 cudaError_t cerr; 3119 #endif 3120 3121 ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr); 3122 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3123 stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 3124 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3125 stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n, 3126 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3127 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3128 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat); 3129 cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr); 3130 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3131 stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3132 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3133 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3134 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat); 3135 ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr); 3136 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3137 cerr = cudaFree(buffer);CHKERRCUDA(cerr); 3138 #else 3139 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3140 stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3141 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3142 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3143 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat); 3144 ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr); 3145 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3146 #endif 3147 stat = 
cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 3148 ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr); 3149 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3150 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3151 } else if (str == SAME_NONZERO_PATTERN) { 3152 cublasHandle_t cublasv2handle; 3153 cublasStatus_t berr; 3154 PetscBLASInt one = 1, bnz = 1; 3155 3156 ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr); 3157 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3158 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 3159 ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr); 3160 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3161 berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr); 3162 ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr); 3163 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3164 ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr); 3165 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3166 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3167 } else { 3168 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr); 3169 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 3170 } 3171 PetscFunctionReturn(0); 3172 } 3173 3174 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a) 3175 { 3176 PetscErrorCode ierr; 3177 Mat_SeqAIJ *y = (Mat_SeqAIJ*)Y->data; 3178 PetscScalar *ay; 3179 cublasHandle_t cublasv2handle; 3180 cublasStatus_t berr; 3181 PetscBLASInt one = 1, bnz = 1; 3182 3183 PetscFunctionBegin; 3184 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3185 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 3186 ierr = PetscBLASIntCast(y->nz,&bnz);CHKERRQ(ierr); 3187 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3188 berr = cublasXscal(cublasv2handle,bnz,&a,ay,one);CHKERRCUBLAS(berr); 3189 ierr = PetscLogGpuFlops(bnz);CHKERRQ(ierr); 3190 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3191 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3192 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3193 PetscFunctionReturn(0); 3194 } 3195 3196 static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) 3197 { 3198 PetscErrorCode ierr; 3199 PetscBool both = PETSC_FALSE; 3200 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3201 3202 PetscFunctionBegin; 3203 if (A->factortype == MAT_FACTOR_NONE) { 3204 Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr; 3205 if (spptr->mat) { 3206 CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat; 3207 if (matrix->values) { 3208 both = PETSC_TRUE; 3209 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3210 } 3211 } 3212 if (spptr->matTranspose) { 3213 CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat; 3214 if (matrix->values) { 3215 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3216 } 3217 } 3218 } 3219 //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 3220 ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr); 3221 ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 3222 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 3223 else A->offloadmask = PETSC_OFFLOAD_CPU; 3224 3225 PetscFunctionReturn(0); 3226 } 3227 3228 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg) 3229 { 3230 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3231 PetscErrorCode ierr; 3232 3233 PetscFunctionBegin; 3234 if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0); 3235 if (flg) { 3236 ierr = 
MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 3237 3238 A->ops->scale = MatScale_SeqAIJ; 3239 A->ops->axpy = MatAXPY_SeqAIJ; 3240 A->ops->zeroentries = MatZeroEntries_SeqAIJ; 3241 A->ops->mult = MatMult_SeqAIJ; 3242 A->ops->multadd = MatMultAdd_SeqAIJ; 3243 A->ops->multtranspose = MatMultTranspose_SeqAIJ; 3244 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 3245 A->ops->multhermitiantranspose = NULL; 3246 A->ops->multhermitiantransposeadd = NULL; 3247 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ; 3248 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr); 3249 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 3250 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 3251 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 3252 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 3253 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr); 3254 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr); 3255 } else { 3256 A->ops->scale = MatScale_SeqAIJCUSPARSE; 3257 A->ops->axpy = MatAXPY_SeqAIJCUSPARSE; 3258 A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE; 3259 A->ops->mult = MatMult_SeqAIJCUSPARSE; 3260 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 3261 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 3262 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 3263 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 3264 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 3265 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE; 3266 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr); 3267 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3268 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3269 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3270 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3271 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr); 3272 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3273 } 3274 A->boundtocpu = flg; 3275 a->inode.use = flg; 3276 PetscFunctionReturn(0); 3277 } 3278 3279 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 3280 { 3281 PetscErrorCode ierr; 3282 cusparseStatus_t stat; 3283 Mat B; 3284 3285 PetscFunctionBegin; 3286 ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */ 3287 if (reuse == MAT_INITIAL_MATRIX) { 3288 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 3289 } else if (reuse == MAT_REUSE_MATRIX) { 
3290 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 3291 } 3292 B = *newmat; 3293 3294 ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 3295 ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr); 3296 3297 if (reuse != MAT_REUSE_MATRIX && !B->spptr) { 3298 if (B->factortype == MAT_FACTOR_NONE) { 3299 Mat_SeqAIJCUSPARSE *spptr; 3300 ierr = PetscNew(&spptr);CHKERRQ(ierr); 3301 stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3302 stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat); 3303 spptr->format = MAT_CUSPARSE_CSR; 3304 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3305 spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 3306 spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 3307 spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 3308 #endif 3309 B->spptr = spptr; 3310 } else { 3311 Mat_SeqAIJCUSPARSETriFactors *spptr; 3312 3313 ierr = PetscNew(&spptr);CHKERRQ(ierr); 3314 stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3315 stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat); 3316 B->spptr = spptr; 3317 } 3318 B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 3319 } 3320 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 3321 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 3322 B->ops->setoption = MatSetOption_SeqAIJCUSPARSE; 3323 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 3324 B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 3325 B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 3326 3327 ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr); 3328 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3329 ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 3330 PetscFunctionReturn(0); 3331 } 3332 3333 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 3334 { 3335 PetscErrorCode ierr; 3336 3337 PetscFunctionBegin; 3338 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 3339 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 3340 PetscFunctionReturn(0); 3341 } 3342 3343 /*MC 3344 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 3345 3346 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 3347 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 3348 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 3349 3350 Options Database Keys: 3351 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 3352 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3353 - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 
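
   For example, an application that creates its matrices with MatSetFromOptions() can switch to this
   type and pick the storage format from the command line (illustrative; "./app" stands for any such
   PETSc program):
.vb
      ./app -mat_type aijcusparse -mat_cusparse_storage_format csr
.ve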
3354 3355 Level: beginner 3356 3357 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 3358 M*/ 3359 3360 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*); 3361 3362 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 3363 { 3364 PetscErrorCode ierr; 3365 3366 PetscFunctionBegin; 3367 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr); 3368 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3369 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3370 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3371 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3372 3373 PetscFunctionReturn(0); 3374 } 3375 3376 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 3377 { 3378 PetscErrorCode ierr; 3379 cusparseStatus_t stat; 3380 3381 PetscFunctionBegin; 3382 if (*cusparsestruct) { 3383 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr); 3384 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr); 3385 delete (*cusparsestruct)->workVector; 3386 delete (*cusparsestruct)->rowoffsets_gpu; 3387 delete (*cusparsestruct)->cooPerm; 3388 delete (*cusparsestruct)->cooPerm_a; 3389 delete (*cusparsestruct)->csr2csc_i; 3390 if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);} 3391 ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr); 3392 } 3393 PetscFunctionReturn(0); 3394 } 3395 3396 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 3397 { 3398 PetscFunctionBegin; 3399 if (*mat) { 3400 delete (*mat)->values; 3401 delete (*mat)->column_indices; 3402 delete (*mat)->row_offsets; 3403 delete *mat; 3404 *mat = 0; 3405 } 3406 PetscFunctionReturn(0); 3407 } 3408 3409 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 3410 { 3411 cusparseStatus_t stat; 3412 PetscErrorCode ierr; 3413 3414 PetscFunctionBegin; 3415 if (*trifactor) { 3416 if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); } 3417 if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); } 3418 ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 3419 if ((*trifactor)->solveBuffer) {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);} 3420 if ((*trifactor)->AA_h) {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);} 3421 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3422 if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);} 3423 #endif 3424 ierr = PetscFree(*trifactor);CHKERRQ(ierr); 3425 } 3426 PetscFunctionReturn(0); 3427 } 3428 3429 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 3430 { 3431 CsrMatrix *mat; 3432 cusparseStatus_t stat; 3433 
cudaError_t err; 3434 3435 PetscFunctionBegin; 3436 if (*matstruct) { 3437 if ((*matstruct)->mat) { 3438 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 3439 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3440 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3441 #else 3442 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 3443 stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat); 3444 #endif 3445 } else { 3446 mat = (CsrMatrix*)(*matstruct)->mat; 3447 CsrMatrix_Destroy(&mat); 3448 } 3449 } 3450 if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); } 3451 delete (*matstruct)->cprowIndices; 3452 if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); } 3453 if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); } 3454 if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); } 3455 3456 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3457 Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 3458 if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);} 3459 for (int i=0; i<3; i++) { 3460 if (mdata->cuSpMV[i].initialized) { 3461 err = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err); 3462 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat); 3463 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat); 3464 } 3465 } 3466 #endif 3467 delete *matstruct; 3468 *matstruct = NULL; 3469 } 3470 PetscFunctionReturn(0); 3471 } 3472 3473 PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors) 3474 { 3475 PetscErrorCode ierr; 3476 3477 PetscFunctionBegin; 3478 if (*trifactors) { 3479 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr); 3480 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr); 3481 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr); 3482 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr); 3483 delete (*trifactors)->rpermIndices; 3484 delete (*trifactors)->cpermIndices; 3485 delete (*trifactors)->workVector; 3486 (*trifactors)->rpermIndices = NULL; 3487 (*trifactors)->cpermIndices = NULL; 3488 (*trifactors)->workVector = NULL; 3489 if ((*trifactors)->a_band_d) {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);} 3490 if ((*trifactors)->i_band_d) {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);} 3491 (*trifactors)->init_dev_prop = PETSC_FALSE; 3492 } 3493 PetscFunctionReturn(0); 3494 } 3495 3496 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 3497 { 3498 PetscErrorCode ierr; 3499 cusparseHandle_t handle; 3500 cusparseStatus_t stat; 3501 3502 PetscFunctionBegin; 3503 if (*trifactors) { 3504 ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr); 3505 if (handle = (*trifactors)->handle) { 3506 stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 3507 } 3508 ierr = PetscFree(*trifactors);CHKERRQ(ierr); 3509 } 3510 PetscFunctionReturn(0); 3511 } 3512 3513 struct IJCompare 3514 { 3515 __host__ __device__ 3516 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3517 { 3518 if (t1.get<0>() < t2.get<0>()) return true; 3519 if (t1.get<0>() == 
t2.get<0>()) return t1.get<1>() < t2.get<1>(); 3520 return false; 3521 } 3522 }; 3523 3524 struct IJEqual 3525 { 3526 __host__ __device__ 3527 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3528 { 3529 if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false; 3530 return true; 3531 } 3532 }; 3533 3534 struct IJDiff 3535 { 3536 __host__ __device__ 3537 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 3538 { 3539 return t1 == t2 ? 0 : 1; 3540 } 3541 }; 3542 3543 struct IJSum 3544 { 3545 __host__ __device__ 3546 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 3547 { 3548 return t1||t2; 3549 } 3550 }; 3551 3552 #include <thrust/iterator/discard_iterator.h> 3553 PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 3554 { 3555 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3556 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3557 THRUSTARRAY *cooPerm_v = NULL; 3558 thrust::device_ptr<const PetscScalar> d_v; 3559 CsrMatrix *matrix; 3560 PetscErrorCode ierr; 3561 PetscInt n; 3562 3563 PetscFunctionBegin; 3564 if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct"); 3565 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix"); 3566 if (!cusp->cooPerm) { 3567 ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3568 ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3569 PetscFunctionReturn(0); 3570 } 3571 matrix = (CsrMatrix*)cusp->mat->mat; 3572 if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3573 if (!v) { 3574 if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3575 goto finalize; 3576 } 3577 n = cusp->cooPerm->size(); 3578 if (isCudaMem(v)) { 3579 d_v = thrust::device_pointer_cast(v); 3580 } else { 3581 cooPerm_v = new THRUSTARRAY(n); 3582 cooPerm_v->assign(v,v+n); 3583 d_v = cooPerm_v->data(); 3584 ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); 3585 } 3586 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3587 if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */ 3588 if (cusp->cooPerm_a) { 3589 THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size()); 3590 auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3591 thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3592 thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>()); 3593 delete cooPerm_w; 3594 } else { 3595 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 3596 matrix->values->begin())); 3597 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 3598 matrix->values->end())); 3599 thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); 3600 } 3601 } else { 3602 if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */ 3603 auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3604 thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3605 } else { 3606 
auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 3607 matrix->values->begin())); 3608 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 3609 matrix->values->end())); 3610 thrust::for_each(zibit,zieit,VecCUDAEquals()); 3611 } 3612 } 3613 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3614 finalize: 3615 delete cooPerm_v; 3616 A->offloadmask = PETSC_OFFLOAD_GPU; 3617 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 3618 /* shorter version of MatAssemblyEnd_SeqAIJ */ 3619 ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr); 3620 ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 3621 ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr); 3622 a->reallocs = 0; 3623 A->info.mallocs += 0; 3624 A->info.nz_unneeded = 0; 3625 A->assembled = A->was_assembled = PETSC_TRUE; 3626 A->num_ass++; 3627 PetscFunctionReturn(0); 3628 } 3629 3630 PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy) 3631 { 3632 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3633 PetscErrorCode ierr; 3634 3635 PetscFunctionBegin; 3636 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3637 if (!cusp) PetscFunctionReturn(0); 3638 if (destroy) { 3639 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr); 3640 delete cusp->csr2csc_i; 3641 cusp->csr2csc_i = NULL; 3642 } 3643 A->transupdated = PETSC_FALSE; 3644 PetscFunctionReturn(0); 3645 } 3646 3647 #include <thrust/binary_search.h> 3648 PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[]) 3649 { 3650 PetscErrorCode ierr; 3651 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3652 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3653 PetscInt cooPerm_n, nzr = 0; 3654 cudaError_t cerr; 3655 3656 PetscFunctionBegin; 3657 ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr); 3658 ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr); 3659 cooPerm_n = cusp->cooPerm ? 
cusp->cooPerm->size() : 0; 3660 if (n != cooPerm_n) { 3661 delete cusp->cooPerm; 3662 delete cusp->cooPerm_a; 3663 cusp->cooPerm = NULL; 3664 cusp->cooPerm_a = NULL; 3665 } 3666 if (n) { 3667 THRUSTINTARRAY d_i(n); 3668 THRUSTINTARRAY d_j(n); 3669 THRUSTINTARRAY ii(A->rmap->n); 3670 3671 if (!cusp->cooPerm) { cusp->cooPerm = new THRUSTINTARRAY(n); } 3672 if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); } 3673 3674 ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 3675 d_i.assign(coo_i,coo_i+n); 3676 d_j.assign(coo_j,coo_j+n); 3677 auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin())); 3678 auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end())); 3679 3680 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3681 thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0); 3682 thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); 3683 *cusp->cooPerm_a = d_i; 3684 THRUSTINTARRAY w = d_j; 3685 3686 auto nekey = thrust::unique(fkey, ekey, IJEqual()); 3687 if (nekey == ekey) { /* all entries are unique */ 3688 delete cusp->cooPerm_a; 3689 cusp->cooPerm_a = NULL; 3690 } else { /* I couldn't come up with a more elegant algorithm */ 3691 adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); 3692 adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff()); 3693 (*cusp->cooPerm_a)[0] = 0; 3694 w[0] = 0; 3695 thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); 3696 thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); 3697 } 3698 thrust::counting_iterator<PetscInt> search_begin(0); 3699 thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), 3700 search_begin, search_begin + A->rmap->n, 3701 ii.begin()); 3702 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3703 3704 ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr); 3705 a->singlemalloc = PETSC_FALSE; 3706 a->free_a = PETSC_TRUE; 3707 a->free_ij = PETSC_TRUE; 3708 ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr); 3709 a->i[0] = 0; 3710 cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 3711 a->nz = a->maxnz = a->i[A->rmap->n]; 3712 a->rmax = 0; 3713 ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr); 3714 ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr); 3715 cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 3716 if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); } 3717 if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); } 3718 for (PetscInt i = 0; i < A->rmap->n; i++) { 3719 const PetscInt nnzr = a->i[i+1] - a->i[i]; 3720 nzr += (PetscInt)!!(nnzr); 3721 a->ilen[i] = a->imax[i] = nnzr; 3722 a->rmax = PetscMax(a->rmax,nnzr); 3723 } 3724 a->nonzerorowcnt = nzr; 3725 A->preallocated = PETSC_TRUE; 3726 ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr); 3727 ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr); 3728 } else { 3729 ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr); 3730 } 3731 ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr); 3732 3733 /* We want to allocate the CUSPARSE struct for matvec now. 
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix *csr;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  PetscFunctionReturn(0);
}

PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  *a = NULL;
  PetscFunctionReturn(0);
}

PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix *csr;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL;
  PetscFunctionReturn(0);
}
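/* A minimal usage sketch (added commentary, not part of the original source) of the accessors
   above: obtain a device pointer to the CSR values of an assembled MATSEQAIJCUSPARSE matrix,
   use it on the GPU, and restore it.  MyKernel is a placeholder name:

     const PetscScalar *av;
     ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);    // read-only access, av is device memory
     // MyKernel<<<blocks,threads>>>(av,...);
     ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);

   Use MatSeqAIJCUSPARSEGetArray()/MatSeqAIJCUSPARSERestoreArray() instead when the values will be
   modified: that pair marks the GPU copy as authoritative (offloadmask set to GPU), invalidates any
   cached transpose values, and bumps the object state on restore. */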
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix *csr;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL;
  PetscFunctionReturn(0);
}

struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    if (t1.get<0>() < t2.get<0>()) return true;
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
    return false;
  }
};

struct Shift
{
  int _shift;

  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &c)
  {
    return c + _shift;
  }
};

/* merges two SeqAIJCUSPARSE matrices side by side, C = [A B], i.e. the [A';B']' operation in MATLAB notation */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
  Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix *Acsr,*Bcsr,*Ccsr;
  PetscInt Annz,Bnnz;
  cusparseStatus_t stat;
  PetscInt i,m,n,zero = 0;
  cudaError_t cerr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",A->rmap->n,B->rmap->n);
  if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
  if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    m = A->rmap->n;
    n = A->cmap->n + B->cmap->n;
    ierr = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
    ierr = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
    ierr = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    c = (Mat_SeqAIJ*)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
    Cmat = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr = new CsrMatrix;
    Cmat->cprowIndices = NULL;
    c->compressedrow.use = PETSC_FALSE;
    c->compressedrow.nrows = 0;
    c->compressedrow.i = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector = NULL;
    Ccusp->nrows = m;
    Ccusp->mat = Cmat;
    Ccusp->mat->mat = Ccsr;
    Ccsr->num_rows = m;
    Ccsr->num_cols = n;
    stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
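    /* configure the cuSPARSE descriptor of C and stage the scalar constants 1 and 0 in device
       memory so they can be passed as alpha/beta to later cuSPARSE calls */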
    stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr);
    if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr = (CsrMatrix*)Acusp->mat->mat;
    Bcsr = (CsrMatrix*)Bcusp->mat->mat;
    Annz = (PetscInt)Acsr->column_indices->size();
    Bnnz = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    Ccsr->num_entries = c->nz;
    Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      auto Acoo = new THRUSTINTARRAY32(Annz);
      auto Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff,*Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
          ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
          ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      stat = cusparseXcsr2coo(Acusp->handle,
                              Aroff->data().get(),
                              Annz,
                              m,
                              Acoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle,
                              Broff->data().get(),
                              Bnnz,
                              m,
                              Bcoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
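      /* Merge strategy: both CSR patterns are expanded to COO row indices, B's column indices are
         shifted by A->cmap->n so they land in the right half of C, and the two entry streams are
         merged by (row,col) with IJCompare4.  The constant 1/0 iterators tag each merged entry with
         its origin (A or B) so that cooPerm, the map from the entries of A and B into C, can be
         recovered afterwards */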
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
      auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
      auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
      auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
      auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
      auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
      auto p1 = Ccusp->cooPerm->begin();
      auto p2 = Ccusp->cooPerm->begin();
      thrust::advance(p2,Annz);
      PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
      thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
#endif
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 /* Errors on SUMMIT cuda 11.1.0 */
      PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      auto pred = thrust::identity<int>();
      PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
      PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
#endif
      stat = cusparseXcoo2csr(Ccusp->handle,
                              Ccoo->data().get(),
                              c->nz,
                              m,
                              Ccsr->row_offsets->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
                               Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
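        /* C^T = [A^T; B^T]: stack the stored transposes of A and B, appending B^T's row offsets
           shifted by nnz(A) and concatenating the column indices and values */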
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix *CcsrT = new CsrMatrix;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated = PETSC_TRUE;
        Ccusp->rowoffsets_gpu = NULL;
        CmatT->cprowIndices = NULL;
        CmatT->mat = CcsrT;
        CcsrT->num_rows = n;
        CcsrT->num_cols = m;
        CcsrT->num_entries = c->nz;

        CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values = new THRUSTARRAY(c->nz);

        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
          thrust::advance(rT,-1);
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
          thrust::copy(titb,tite,rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

        stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
                                 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
                                 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    c->singlemalloc = PETSC_FALSE;
    c->free_a = PETSC_TRUE;
    c->free_ij = PETSC_TRUE;
    ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
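    /* copy the merged CSR pattern of C back to the host Mat_SeqAIJ arrays (c->i, c->j) so the
       matrix is also usable from the CPU side */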
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii = *Ccsr->row_offsets;
      jj = *Ccsr->column_indices;
      cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    } else {
      cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
    c->maxnz = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax = 0;
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i+1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax,nn);
    }
    ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
    (*C)->nonzerostate++;
    ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
    ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated = PETSC_TRUE;
  } else {
    if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",(*C)->rmap->n,B->rmap->n);
    c = (Mat_SeqAIJ*)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
      if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
      if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
      if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
      if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix*)Acusp->mat->mat;
      Bcsr = (CsrMatrix*)Bcusp->mat->mat;
      Ccsr = (CsrMatrix*)Ccusp->mat->mat;
      if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
      if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
      if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
      if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
      if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid,Acsr->num_entries);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      thrust::for_each(zibait,zieait,VecCUDAEquals());
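      /* now scatter B's values into C through the remaining (second) part of cooPerm */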
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
      thrust::for_each(zibbit,ziebit,VecCUDAEquals());
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr);
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    }
  }
  ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
  (*C)->assembled = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode ierr;
  bool dmem;
  const PetscScalar *av;
  cudaError_t cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    THRUSTARRAY *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) {
      cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    delete w;
  } else {
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); } /* the values were copied from the GPU to the host */
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
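/* Note (added commentary, not in the original source): the zip-iterator/VecCUDAEquals pattern used
   throughout this file assigns the second tuple element from the first; for MatSeqAIJCopySubArray
   above it therefore performs a device gather, v[k] = a[idx[k]] for k = 0,...,n-1, equivalent in
   effect to

     thrust::gather(widx.begin(), widx.end(), dav, dv);

   The explicit zip/for_each form additionally covers the scatter-style assignments (permutation
   iterator on the output side) used in the COO and merge paths above. */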