1 /* 2 Defines the basic matrix operations for the AIJ (compressed row) 3 matrix storage format using the CUSPARSE library, 4 */ 5 #define PETSC_SKIP_SPINLOCK 6 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 7 8 #include <petscconf.h> 9 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 10 #include <../src/mat/impls/sbaij/seq/sbaij.h> 11 #include <../src/vec/vec/impls/dvecimpl.h> 12 #include <petsc/private/vecimpl.h> 13 #undef VecType 14 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 15 #include <thrust/async/for_each.h> 16 17 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 18 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 19 /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in 20 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them. 21 22 typedef enum { 23 CUSPARSE_MV_ALG_DEFAULT = 0, 24 CUSPARSE_COOMV_ALG = 1, 25 CUSPARSE_CSRMV_ALG1 = 2, 26 CUSPARSE_CSRMV_ALG2 = 3 27 } cusparseSpMVAlg_t; 28 29 typedef enum { 30 CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0, 31 CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1, 32 CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2, 33 CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3, 34 CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4, 35 CUSPARSE_SPMM_ALG_DEFAULT = 0, 36 CUSPARSE_SPMM_COO_ALG1 = 1, 37 CUSPARSE_SPMM_COO_ALG2 = 2, 38 CUSPARSE_SPMM_COO_ALG3 = 3, 39 CUSPARSE_SPMM_COO_ALG4 = 5, 40 CUSPARSE_SPMM_CSR_ALG1 = 4, 41 CUSPARSE_SPMM_CSR_ALG2 = 6, 42 } cusparseSpMMAlg_t; 43 44 typedef enum { 45 CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc 46 CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministc 47 } cusparseCsr2CscAlg_t; 48 */ 49 const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0}; 50 const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0}; 51 const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! 
We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0}; 52 #endif 53 54 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 55 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 56 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 57 58 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 59 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 60 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 61 62 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 63 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 64 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 65 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 66 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat); 67 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure); 68 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar); 69 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 70 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 71 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 72 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 73 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 74 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 75 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool); 76 77 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**); 78 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**); 79 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat); 80 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**); 81 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**); 82 83 PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat); 84 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat); 85 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool); 86 87 PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]); 88 PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode); 89 90 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]); 91 92 PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream) 93 { 94 cusparseStatus_t stat; 95 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 96 97 PetscFunctionBegin; 98 if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 99 cusparsestruct->stream = stream; 100 stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat); 101 PetscFunctionReturn(0); 102 } 103 104 PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle) 105 { 106 cusparseStatus_t stat; 107 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 108 109 PetscFunctionBegin; 110 if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 111 if (cusparsestruct->handle != handle) { 112 if 
(cusparsestruct->handle) { 113 stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat); 114 } 115 cusparsestruct->handle = handle; 116 } 117 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 118 PetscFunctionReturn(0); 119 } 120 121 PetscErrorCode MatCUSPARSEClearHandle(Mat A) 122 { 123 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 124 PetscBool flg; 125 PetscErrorCode ierr; 126 127 PetscFunctionBegin; 128 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 129 if (!flg || !cusparsestruct) PetscFunctionReturn(0); 130 if (cusparsestruct->handle) cusparsestruct->handle = 0; 131 PetscFunctionReturn(0); 132 } 133 134 PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type) 135 { 136 PetscFunctionBegin; 137 *type = MATSOLVERCUSPARSE; 138 PetscFunctionReturn(0); 139 } 140 141 /*MC 142 MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for sequential matrices 143 on a single GPU of type seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported 144 algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer 145 performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the 146 CUSPARSE triangular solve algorithm. However, the performance can be quite poor, so these 147 algorithms are not recommended. This class does NOT support direct solver operations. 148 149 Level: beginner 150 151 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 152 M*/ 153 154 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B) 155 { 156 PetscErrorCode ierr; 157 PetscInt n = A->rmap->n; 158 159 PetscFunctionBegin; 160 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 161 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 162 (*B)->factortype = ftype; 163 ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 164 165 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 166 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 167 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 168 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 169 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 170 ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr); 171 ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr); 172 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 173 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 174 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 175 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr); 176 ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr); 177 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 178 179 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 180 (*B)->canuseordering = PETSC_TRUE; 181 ierr =
PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr); 182 PetscFunctionReturn(0); 183 } 184 185 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 186 { 187 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 188 189 PetscFunctionBegin; 190 switch (op) { 191 case MAT_CUSPARSE_MULT: 192 cusparsestruct->format = format; 193 break; 194 case MAT_CUSPARSE_ALL: 195 cusparsestruct->format = format; 196 break; 197 default: 198 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op); 199 } 200 PetscFunctionReturn(0); 201 } 202 203 /*@ 204 MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 205 operation. Only the MatMult operation can use different GPU storage formats 206 for MPIAIJCUSPARSE matrices. 207 Not Collective 208 209 Input Parameters: 210 + A - Matrix of type SEQAIJCUSPARSE 211 . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 212 - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2) 213 214 Output Parameter: 215 216 Level: intermediate 217 218 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 219 @*/ 220 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 221 { 222 PetscErrorCode ierr; 223 224 PetscFunctionBegin; 225 PetscValidHeaderSpecific(A, MAT_CLASSID,1); 226 ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr); 227 PetscFunctionReturn(0); 228 } 229 230 PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg) 231 { 232 PetscErrorCode ierr; 233 234 PetscFunctionBegin; 235 switch (op) { 236 case MAT_FORM_EXPLICIT_TRANSPOSE: 237 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 238 if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);} 239 A->form_explicit_transpose = flg; 240 break; 241 default: 242 ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr); 243 break; 244 } 245 PetscFunctionReturn(0); 246 } 247 248 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A); 249 250 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 251 { 252 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 253 IS isrow = b->row,iscol = b->col; 254 PetscBool row_identity,col_identity; 255 PetscErrorCode ierr; 256 257 PetscFunctionBegin; 258 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 259 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 260 B->offloadmask = PETSC_OFFLOAD_CPU; 261 /* determine which version of MatSolve needs to be used. 
*/ 262 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 263 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 264 if (row_identity && col_identity) { 265 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 266 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 267 B->ops->matsolve = NULL; 268 B->ops->matsolvetranspose = NULL; 269 } else { 270 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 271 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 272 B->ops->matsolve = NULL; 273 B->ops->matsolvetranspose = NULL; 274 } 275 276 /* get the triangular factors */ 277 ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 278 PetscFunctionReturn(0); 279 } 280 281 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 282 { 283 PetscErrorCode ierr; 284 MatCUSPARSEStorageFormat format; 285 PetscBool flg; 286 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 287 288 PetscFunctionBegin; 289 ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr); 290 if (A->factortype == MAT_FACTOR_NONE) { 291 ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV", 292 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 293 if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);} 294 295 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 296 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 297 if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);} 298 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 299 ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", 300 "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr); 301 /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */ 302 if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 303 304 ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", 305 "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr); 306 if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly"); 307 308 ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", 309 "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr); 310 if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly"); 311 #endif 312 } 313 ierr = PetscOptionsTail();CHKERRQ(ierr); 314 PetscFunctionReturn(0); 315 } 316 317 static PetscErrorCode 
MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 318 { 319 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 320 PetscErrorCode ierr; 321 322 PetscFunctionBegin; 323 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 324 ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 325 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 326 PetscFunctionReturn(0); 327 } 328 329 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 330 { 331 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 332 PetscErrorCode ierr; 333 334 PetscFunctionBegin; 335 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 336 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 337 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 338 PetscFunctionReturn(0); 339 } 340 341 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 342 { 343 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 344 PetscErrorCode ierr; 345 346 PetscFunctionBegin; 347 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 348 ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 349 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 350 PetscFunctionReturn(0); 351 } 352 353 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 354 { 355 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 356 PetscErrorCode ierr; 357 358 PetscFunctionBegin; 359 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 360 ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 361 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 362 PetscFunctionReturn(0); 363 } 364 365 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 366 { 367 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 368 PetscInt n = A->rmap->n; 369 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 370 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 371 cusparseStatus_t stat; 372 const PetscInt *ai = a->i,*aj = a->j,*vi; 373 const MatScalar *aa = a->a,*v; 374 PetscInt *AiLo, *AjLo; 375 PetscInt i,nz, nzLower, offset, rowOffset; 376 PetscErrorCode ierr; 377 cudaError_t cerr; 378 379 PetscFunctionBegin; 380 if (!n) PetscFunctionReturn(0); 381 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 382 try { 383 /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. 
*/ 384 nzLower=n+ai[n]-ai[1]; 385 if (!loTriFactor) { 386 PetscScalar *AALo; 387 388 cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr); 389 390 /* Allocate Space for the lower triangular matrix */ 391 cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 392 cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr); 393 394 /* Fill the lower triangular matrix */ 395 AiLo[0] = (PetscInt) 0; 396 AiLo[n] = nzLower; 397 AjLo[0] = (PetscInt) 0; 398 AALo[0] = (MatScalar) 1.0; 399 v = aa; 400 vi = aj; 401 offset = 1; 402 rowOffset= 1; 403 for (i=1; i<n; i++) { 404 nz = ai[i+1] - ai[i]; 405 /* additional 1 for the term on the diagonal */ 406 AiLo[i] = rowOffset; 407 rowOffset += nz+1; 408 409 ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr); 410 ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr); 411 412 offset += nz; 413 AjLo[offset] = (PetscInt) i; 414 AALo[offset] = (MatScalar) 1.0; 415 offset += 1; 416 417 v += nz; 418 vi += nz; 419 } 420 421 /* allocate space for the triangular factor information */ 422 ierr = PetscNew(&loTriFactor);CHKERRQ(ierr); 423 loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 424 /* Create the matrix description */ 425 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 426 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 427 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 428 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 429 #else 430 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 431 #endif 432 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat); 433 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 434 435 /* set the operation */ 436 loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 437 438 /* set the matrix */ 439 loTriFactor->csrMat = new CsrMatrix; 440 loTriFactor->csrMat->num_rows = n; 441 loTriFactor->csrMat->num_cols = n; 442 loTriFactor->csrMat->num_entries = nzLower; 443 444 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 445 loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1); 446 447 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 448 loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower); 449 450 loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 451 loTriFactor->csrMat->values->assign(AALo, AALo+nzLower); 452 453 /* Create the solve analysis information */ 454 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 455 stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 456 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 457 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 458 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 459 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 460 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 461 &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 462 cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 463 #endif 464 465 /* perform the solve analysis */ 466 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 467 loTriFactor->csrMat->num_rows, 
loTriFactor->csrMat->num_entries, loTriFactor->descr, 468 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 469 loTriFactor->csrMat->column_indices->data().get(), 470 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 471 loTriFactor->solveInfo, 472 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 473 #else 474 loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 475 #endif 476 cerr = WaitForCUDA();CHKERRCUDA(cerr); 477 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 478 479 /* assign the pointer */ 480 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 481 loTriFactor->AA_h = AALo; 482 cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr); 483 cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr); 484 ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 485 } else { /* update values only */ 486 if (!loTriFactor->AA_h) { 487 cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr); 488 } 489 /* Fill the lower triangular matrix */ 490 loTriFactor->AA_h[0] = 1.0; 491 v = aa; 492 vi = aj; 493 offset = 1; 494 for (i=1; i<n; i++) { 495 nz = ai[i+1] - ai[i]; 496 ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr); 497 offset += nz; 498 loTriFactor->AA_h[offset] = 1.0; 499 offset += 1; 500 v += nz; 501 } 502 loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower); 503 ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 504 } 505 } catch(char *ex) { 506 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 507 } 508 } 509 PetscFunctionReturn(0); 510 } 511 512 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 513 { 514 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 515 PetscInt n = A->rmap->n; 516 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 517 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 518 cusparseStatus_t stat; 519 const PetscInt *aj = a->j,*adiag = a->diag,*vi; 520 const MatScalar *aa = a->a,*v; 521 PetscInt *AiUp, *AjUp; 522 PetscInt i,nz, nzUpper, offset; 523 PetscErrorCode ierr; 524 cudaError_t cerr; 525 526 PetscFunctionBegin; 527 if (!n) PetscFunctionReturn(0); 528 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 529 try { 530 /* next, figure out the number of nonzeros in the upper triangular matrix. 
*/ 531 nzUpper = adiag[0]-adiag[n]; 532 if (!upTriFactor) { 533 PetscScalar *AAUp; 534 535 cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 536 537 /* Allocate Space for the upper triangular matrix */ 538 cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 539 cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 540 541 /* Fill the upper triangular matrix */ 542 AiUp[0]=(PetscInt) 0; 543 AiUp[n]=nzUpper; 544 offset = nzUpper; 545 for (i=n-1; i>=0; i--) { 546 v = aa + adiag[i+1] + 1; 547 vi = aj + adiag[i+1] + 1; 548 549 /* number of elements NOT on the diagonal */ 550 nz = adiag[i] - adiag[i+1]-1; 551 552 /* decrement the offset */ 553 offset -= (nz+1); 554 555 /* first, set the diagonal elements */ 556 AjUp[offset] = (PetscInt) i; 557 AAUp[offset] = (MatScalar)1./v[nz]; 558 AiUp[i] = AiUp[i+1] - (nz+1); 559 560 ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr); 561 ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr); 562 } 563 564 /* allocate space for the triangular factor information */ 565 ierr = PetscNew(&upTriFactor);CHKERRQ(ierr); 566 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 567 568 /* Create the matrix description */ 569 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 570 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 571 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 572 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 573 #else 574 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 575 #endif 576 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 577 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 578 579 /* set the operation */ 580 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 581 582 /* set the matrix */ 583 upTriFactor->csrMat = new CsrMatrix; 584 upTriFactor->csrMat->num_rows = n; 585 upTriFactor->csrMat->num_cols = n; 586 upTriFactor->csrMat->num_entries = nzUpper; 587 588 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 589 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 590 591 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 592 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 593 594 upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 595 upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 596 597 /* Create the solve analysis information */ 598 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 599 stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 600 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 601 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 602 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 603 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 604 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 605 &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 606 cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 607 #endif 608 609 /* perform the solve analysis */ 610 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 611 upTriFactor->csrMat->num_rows, 
upTriFactor->csrMat->num_entries, upTriFactor->descr, 612 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 613 upTriFactor->csrMat->column_indices->data().get(), 614 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 615 upTriFactor->solveInfo, 616 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 617 #else 618 upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 619 #endif 620 cerr = WaitForCUDA();CHKERRCUDA(cerr); 621 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 622 623 /* assign the pointer */ 624 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 625 upTriFactor->AA_h = AAUp; 626 cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 627 cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 628 ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 629 } else { 630 if (!upTriFactor->AA_h) { 631 cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 632 } 633 /* Fill the upper triangular matrix */ 634 offset = nzUpper; 635 for (i=n-1; i>=0; i--) { 636 v = aa + adiag[i+1] + 1; 637 638 /* number of elements NOT on the diagonal */ 639 nz = adiag[i] - adiag[i+1]-1; 640 641 /* decrement the offset */ 642 offset -= (nz+1); 643 644 /* first, set the diagonal elements */ 645 upTriFactor->AA_h[offset] = 1./v[nz]; 646 ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr); 647 } 648 upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper); 649 ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 650 } 651 } catch(char *ex) { 652 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 653 } 654 } 655 PetscFunctionReturn(0); 656 } 657 658 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 659 { 660 PetscErrorCode ierr; 661 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 662 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 663 IS isrow = a->row,iscol = a->icol; 664 PetscBool row_identity,col_identity; 665 PetscInt n = A->rmap->n; 666 667 PetscFunctionBegin; 668 if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 669 ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr); 670 ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr); 671 672 if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 673 cusparseTriFactors->nnz=a->nz; 674 675 A->offloadmask = PETSC_OFFLOAD_BOTH; 676 /* lower triangular indices */ 677 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 678 if (!row_identity && !cusparseTriFactors->rpermIndices) { 679 const PetscInt *r; 680 681 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 682 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 683 cusparseTriFactors->rpermIndices->assign(r, r+n); 684 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 685 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 686 } 687 688 /* upper triangular indices */ 689 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 690 if (!col_identity && !cusparseTriFactors->cpermIndices) { 691 const PetscInt *c; 692 693 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 694 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 695 cusparseTriFactors->cpermIndices->assign(c, c+n); 696 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 697 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 698 } 699 PetscFunctionReturn(0); 700 } 
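/* A minimal usage sketch (assumes an already assembled MATSEQAIJCUSPARSE matrix A and vectors b and x;
   error checking omitted). The ILU analysis/copy path above is reached when the cuSPARSE solver type is
   selected for a factorization preconditioner:

     KSP ksp;
     PC  pc;
     KSPCreate(PETSC_COMM_SELF,&ksp);
     KSPSetOperators(ksp,A,A);
     KSPGetPC(ksp,&pc);
     PCSetType(pc,PCILU);                             -- ILU(k) preconditioner
     PCFactorSetMatSolverType(pc,MATSOLVERCUSPARSE);  -- triangular solves handled by the routines in this file
     KSPSetFromOptions(ksp);
     KSPSolve(ksp,b,x);

   Equivalently, from the command line: -mat_type seqaijcusparse -pc_type ilu -pc_factor_mat_solver_type cusparse */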
701 702 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 703 { 704 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 705 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 706 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 707 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 708 cusparseStatus_t stat; 709 PetscErrorCode ierr; 710 cudaError_t cerr; 711 PetscInt *AiUp, *AjUp; 712 PetscScalar *AAUp; 713 PetscScalar *AALo; 714 PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 715 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 716 const PetscInt *ai = b->i,*aj = b->j,*vj; 717 const MatScalar *aa = b->a,*v; 718 719 PetscFunctionBegin; 720 if (!n) PetscFunctionReturn(0); 721 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 722 try { 723 cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 724 cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 725 if (!upTriFactor && !loTriFactor) { 726 /* Allocate Space for the upper triangular matrix */ 727 cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 728 cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 729 730 /* Fill the upper triangular matrix */ 731 AiUp[0]=(PetscInt) 0; 732 AiUp[n]=nzUpper; 733 offset = 0; 734 for (i=0; i<n; i++) { 735 /* set the pointers */ 736 v = aa + ai[i]; 737 vj = aj + ai[i]; 738 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 739 740 /* first, set the diagonal elements */ 741 AjUp[offset] = (PetscInt) i; 742 AAUp[offset] = (MatScalar)1.0/v[nz]; 743 AiUp[i] = offset; 744 AALo[offset] = (MatScalar)1.0/v[nz]; 745 746 offset+=1; 747 if (nz>0) { 748 ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr); 749 ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 750 for (j=offset; j<offset+nz; j++) { 751 AAUp[j] = -AAUp[j]; 752 AALo[j] = AAUp[j]/v[nz]; 753 } 754 offset+=nz; 755 } 756 } 757 758 /* allocate space for the triangular factor information */ 759 ierr = PetscNew(&upTriFactor);CHKERRQ(ierr); 760 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 761 762 /* Create the matrix description */ 763 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 764 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 765 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 766 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 767 #else 768 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 769 #endif 770 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 771 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 772 773 /* set the matrix */ 774 upTriFactor->csrMat = new CsrMatrix; 775 upTriFactor->csrMat->num_rows = A->rmap->n; 776 upTriFactor->csrMat->num_cols = A->cmap->n; 777 upTriFactor->csrMat->num_entries = a->nz; 778 779 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 780 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 781 782 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 783 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 784 785 
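/* Layout note with a small hypothetical example (added for clarity): the loop above stores each row of the
   upper factor with its diagonal entry first, followed by the strictly upper-triangular entries of that row.
   For a 3x3 factor with nonzeros at (0,0), (0,2), (1,1), (2,2):

     AiUp = {0, 2, 3, 4}   row offsets, with AiUp[n] = nzUpper
     AjUp = {0, 2, 1, 2}   column indices, diagonal first within each row

   AAUp keeps the reciprocal of the stored diagonal in the diagonal slot and the negated off-diagonal values;
   AALo keeps the same off-diagonal values additionally divided by the stored diagonal, and is reused below
   (with CUSPARSE_OPERATION_TRANSPOSE) as the lower factor. */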
upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 786 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 787 788 /* set the operation */ 789 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 790 791 /* Create the solve analysis information */ 792 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 793 stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 794 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 795 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 796 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 797 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 798 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 799 &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 800 cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 801 #endif 802 803 /* perform the solve analysis */ 804 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 805 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 806 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 807 upTriFactor->csrMat->column_indices->data().get(), 808 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 809 upTriFactor->solveInfo, 810 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 811 #else 812 upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 813 #endif 814 cerr = WaitForCUDA();CHKERRCUDA(cerr); 815 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 816 817 /* assign the pointer */ 818 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 819 820 /* allocate space for the triangular factor information */ 821 ierr = PetscNew(&loTriFactor);CHKERRQ(ierr); 822 loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 823 824 /* Create the matrix description */ 825 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 826 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 827 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 828 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 829 #else 830 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 831 #endif 832 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 833 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 834 835 /* set the operation */ 836 loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 837 838 /* set the matrix */ 839 loTriFactor->csrMat = new CsrMatrix; 840 loTriFactor->csrMat->num_rows = A->rmap->n; 841 loTriFactor->csrMat->num_cols = A->cmap->n; 842 loTriFactor->csrMat->num_entries = a->nz; 843 844 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 845 loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 846 847 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 848 loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 849 850 loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 851 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 852 853 /* Create the solve analysis information */ 854 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 855 stat = 
cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 856 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 857 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 858 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 859 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 860 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 861 &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 862 cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 863 #endif 864 865 /* perform the solve analysis */ 866 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 867 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 868 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 869 loTriFactor->csrMat->column_indices->data().get(), 870 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 871 loTriFactor->solveInfo, 872 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 873 #else 874 loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 875 #endif 876 cerr = WaitForCUDA();CHKERRCUDA(cerr); 877 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 878 879 /* assign the pointer */ 880 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 881 882 ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr); 883 cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 884 cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 885 } else { 886 /* Fill the upper triangular matrix */ 887 offset = 0; 888 for (i=0; i<n; i++) { 889 /* set the pointers */ 890 v = aa + ai[i]; 891 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 892 893 /* first, set the diagonal elements */ 894 AAUp[offset] = 1.0/v[nz]; 895 AALo[offset] = 1.0/v[nz]; 896 897 offset+=1; 898 if (nz>0) { 899 ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 900 for (j=offset; j<offset+nz; j++) { 901 AAUp[j] = -AAUp[j]; 902 AALo[j] = AAUp[j]/v[nz]; 903 } 904 offset+=nz; 905 } 906 } 907 if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 908 if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 909 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 910 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 911 ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 912 } 913 cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr); 914 cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr); 915 } catch(char *ex) { 916 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 917 } 918 } 919 PetscFunctionReturn(0); 920 } 921 922 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 923 { 924 PetscErrorCode ierr; 925 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 926 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 927 IS ip = a->row; 928 PetscBool perm_identity; 929 PetscInt n = A->rmap->n; 930 931 PetscFunctionBegin; 932 if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 933 ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr); 934 if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 935 cusparseTriFactors->nnz=(a->nz-n)*2 + n; 936 937 A->offloadmask = PETSC_OFFLOAD_BOTH; 938 939 /* lower triangular indices */ 940 ierr = 
ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 941 if (!perm_identity) { 942 IS iip; 943 const PetscInt *irip,*rip; 944 945 ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr); 946 ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr); 947 ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); 948 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 949 cusparseTriFactors->rpermIndices->assign(rip, rip+n); 950 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 951 cusparseTriFactors->cpermIndices->assign(irip, irip+n); 952 ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr); 953 ierr = ISDestroy(&iip);CHKERRQ(ierr); 954 ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 955 ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 956 } 957 PetscFunctionReturn(0); 958 } 959 960 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 961 { 962 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 963 IS ip = b->row; 964 PetscBool perm_identity; 965 PetscErrorCode ierr; 966 967 PetscFunctionBegin; 968 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 969 ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 970 B->offloadmask = PETSC_OFFLOAD_CPU; 971 /* determine which version of MatSolve needs to be used. */ 972 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 973 if (perm_identity) { 974 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 975 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 976 B->ops->matsolve = NULL; 977 B->ops->matsolvetranspose = NULL; 978 } else { 979 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 980 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 981 B->ops->matsolve = NULL; 982 B->ops->matsolvetranspose = NULL; 983 } 984 985 /* get the triangular factors */ 986 ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 987 PetscFunctionReturn(0); 988 } 989 990 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 991 { 992 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 993 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 994 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 995 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; 996 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; 997 cusparseStatus_t stat; 998 cusparseIndexBase_t indexBase; 999 cusparseMatrixType_t matrixType; 1000 cusparseFillMode_t fillMode; 1001 cusparseDiagType_t diagType; 1002 cudaError_t cerr; 1003 PetscErrorCode ierr; 1004 1005 PetscFunctionBegin; 1006 /* allocate space for the transpose of the lower triangular factor */ 1007 ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr); 1008 loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1009 1010 /* set the matrix descriptors of the lower triangular factor */ 1011 matrixType = cusparseGetMatType(loTriFactor->descr); 1012 indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 1013 fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 
1014 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1015 diagType = cusparseGetMatDiagType(loTriFactor->descr); 1016 1017 /* Create the matrix description */ 1018 stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat); 1019 stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 1020 stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 1021 stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 1022 stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1023 1024 /* set the operation */ 1025 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1026 1027 /* allocate GPU space for the CSC of the lower triangular factor*/ 1028 loTriFactorT->csrMat = new CsrMatrix; 1029 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 1030 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 1031 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 1032 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1); 1033 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 1034 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 1035 1036 /* compute the transpose of the lower triangular factor, i.e. the CSC */ 1037 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1038 stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1039 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1040 loTriFactor->csrMat->values->data().get(), 1041 loTriFactor->csrMat->row_offsets->data().get(), 1042 loTriFactor->csrMat->column_indices->data().get(), 1043 loTriFactorT->csrMat->values->data().get(), 1044 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1045 CUSPARSE_ACTION_NUMERIC,indexBase, 1046 CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1047 cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1048 #endif 1049 1050 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1051 stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1052 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1053 loTriFactor->csrMat->values->data().get(), 1054 loTriFactor->csrMat->row_offsets->data().get(), 1055 loTriFactor->csrMat->column_indices->data().get(), 1056 loTriFactorT->csrMat->values->data().get(), 1057 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1058 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1059 CUSPARSE_ACTION_NUMERIC, indexBase, 1060 CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat); 1061 #else 1062 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1063 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1064 #endif 1065 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1066 ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1067 1068 /* Create the solve analysis information */ 1069 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1070 stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1071 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1072 stat =
cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, 1073 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1074 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1075 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, 1076 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1077 cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1078 #endif 1079 1080 /* perform the solve analysis */ 1081 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 1082 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1083 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1084 loTriFactorT->csrMat->column_indices->data().get(), 1085 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1086 loTriFactorT->solveInfo, 1087 loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1088 #else 1089 loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1090 #endif 1091 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1092 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1093 1094 /* assign the pointer */ 1095 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 1096 1097 /*********************************************/ 1098 /* Now the Transpose of the Upper Tri Factor */ 1099 /*********************************************/ 1100 1101 /* allocate space for the transpose of the upper triangular factor */ 1102 ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr); 1103 upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1104 1105 /* set the matrix descriptors of the upper triangular factor */ 1106 matrixType = cusparseGetMatType(upTriFactor->descr); 1107 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 1108 fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1109 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1110 diagType = cusparseGetMatDiagType(upTriFactor->descr); 1111 1112 /* Create the matrix description */ 1113 stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat); 1114 stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 1115 stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 1116 stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 1117 stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1118 1119 /* set the operation */ 1120 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1121 1122 /* allocate GPU space for the CSC of the upper triangular factor*/ 1123 upTriFactorT->csrMat = new CsrMatrix; 1124 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1125 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1126 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1127 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1); 1128 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1129 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1130 1131 /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 1132 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1133 stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows, 1134 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1135 upTriFactor->csrMat->values->data().get(), 1136 upTriFactor->csrMat->row_offsets->data().get(), 1137 upTriFactor->csrMat->column_indices->data().get(), 1138 upTriFactorT->csrMat->values->data().get(), 1139 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1140 CUSPARSE_ACTION_NUMERIC,indexBase, 1141 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1142 cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1143 #endif 1144 1145 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1146 stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 1147 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1148 upTriFactor->csrMat->values->data().get(), 1149 upTriFactor->csrMat->row_offsets->data().get(), 1150 upTriFactor->csrMat->column_indices->data().get(), 1151 upTriFactorT->csrMat->values->data().get(), 1152 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1153 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1154 CUSPARSE_ACTION_NUMERIC, indexBase, 1155 CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat); 1156 #else 1157 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1158 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1159 #endif 1160 1161 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1162 ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1163 1164 /* Create the solve analysis information */ 1165 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1166 stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1167 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1168 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, 1169 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1170 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1171 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, 1172 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1173 cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1174 #endif 1175 1176 /* perform the solve analysis */ 1177 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 1178 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1179 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1180 upTriFactorT->csrMat->column_indices->data().get(), 1181 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1182 upTriFactorT->solveInfo, 1183 upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1184 #else 1185 upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1186 #endif 1187 1188 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1189 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1190 1191 /* assign the pointer */ 1192 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 1193 PetscFunctionReturn(0); 1194 } 1195
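/* Worked illustration of why the csr2csc conversions above produce the transposed factors (small example,
   values chosen for illustration only): the CSC representation of a matrix is exactly the CSR representation
   of its transpose. For the 2x3 matrix

     A = [1 0 2]    CSR(A): row_offsets = {0,2,3}, column_indices = {0,2,1}, values = {1,2,3}
         [0 3 0]

   csr2csc returns row_offsets = {0,1,2,3}, column_indices = {0,1,0}, values = {1,3,2}, which, read as CSR,
   is precisely the 3x2 matrix A^T. This is why loTriFactorT/upTriFactorT can be handed to the triangular
   solve with CUSPARSE_OPERATION_NON_TRANSPOSE, and why the fill mode is flipped when their descriptors are
   created. */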
1196 struct PetscScalarToPetscInt 1197 { 1198 __host__ __device__ 1199 PetscInt operator()(PetscScalar s) 1200 { 1201 return (PetscInt)PetscRealPart(s); 1202 } 1203 }; 1204 1205 static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTransposeForMult(Mat A) 1206 { 1207 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1208 Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT; 1209 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1210 cusparseStatus_t stat; 1211 cusparseIndexBase_t indexBase; 1212 cudaError_t err; 1213 PetscErrorCode ierr; 1214 1215 PetscFunctionBegin; 1216 if (!A->form_explicit_transpose || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0); 1217 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1218 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1219 if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct"); 1220 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1221 if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct"); 1222 if (A->transupdated) PetscFunctionReturn(0); 1223 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1224 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1225 if (cusparsestruct->format != MAT_CUSPARSE_CSR) { 1226 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 1227 } 1228 if (!cusparsestruct->matTranspose) { /* create cusparse matrix */ 1229 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 1230 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat); 1231 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1232 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat); 1233 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1234 1235 /* set alpha and beta */ 1236 err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 1237 err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1238 err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1239 err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1240 err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1241 err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1242 1243 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1244 CsrMatrix *matrixT = new CsrMatrix; 1245 matstructT->mat = matrixT; 1246 matrixT->num_rows = A->cmap->n; 1247 matrixT->num_cols = A->rmap->n; 1248 matrixT->num_entries = a->nz; 1249 matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 1250 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1251 matrixT->values = new THRUSTARRAY(a->nz); 1252 1253 if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); } 1254 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 1255 1256 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1257 stat = cusparseCreateCsr(&matstructT->matDescr, 1258 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1259 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1260 matrixT->values->data().get(), 1261 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1262 
indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 1263 #endif 1264 } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 1265 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1266 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1267 #else 1268 CsrMatrix *temp = new CsrMatrix; 1269 CsrMatrix *tempT = new CsrMatrix; 1270 /* First convert HYB to CSR */ 1271 temp->num_rows = A->rmap->n; 1272 temp->num_cols = A->cmap->n; 1273 temp->num_entries = a->nz; 1274 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1275 temp->column_indices = new THRUSTINTARRAY32(a->nz); 1276 temp->values = new THRUSTARRAY(a->nz); 1277 1278 stat = cusparse_hyb2csr(cusparsestruct->handle, 1279 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 1280 temp->values->data().get(), 1281 temp->row_offsets->data().get(), 1282 temp->column_indices->data().get());CHKERRCUSPARSE(stat); 1283 1284 /* Next, convert CSR to CSC (i.e. the matrix transpose) */ 1285 tempT->num_rows = A->rmap->n; 1286 tempT->num_cols = A->cmap->n; 1287 tempT->num_entries = a->nz; 1288 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1289 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1290 tempT->values = new THRUSTARRAY(a->nz); 1291 1292 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1293 temp->num_cols, temp->num_entries, 1294 temp->values->data().get(), 1295 temp->row_offsets->data().get(), 1296 temp->column_indices->data().get(), 1297 tempT->values->data().get(), 1298 tempT->column_indices->data().get(), 1299 tempT->row_offsets->data().get(), 1300 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1301 1302 /* Last, convert CSC to HYB */ 1303 cusparseHybMat_t hybMat; 1304 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1305 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 
1306 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1307 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1308 matstructT->descr, tempT->values->data().get(), 1309 tempT->row_offsets->data().get(), 1310 tempT->column_indices->data().get(), 1311 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1312 1313 /* assign the pointer */ 1314 matstructT->mat = hybMat; 1315 A->transupdated = PETSC_TRUE; 1316 /* delete temporaries */ 1317 if (tempT) { 1318 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1319 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1320 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1321 delete (CsrMatrix*) tempT; 1322 } 1323 if (temp) { 1324 if (temp->values) delete (THRUSTARRAY*) temp->values; 1325 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1326 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1327 delete (CsrMatrix*) temp; 1328 } 1329 #endif 1330 } 1331 } 1332 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */ 1333 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1334 CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat; 1335 if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix"); 1336 if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows"); 1337 if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols"); 1338 if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values"); 1339 if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT"); 1340 if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows"); 1341 if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols"); 1342 if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values"); 1343 if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */ 1344 cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1345 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 1346 ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 1347 } 1348 if (!cusparsestruct->csr2csc_i) { 1349 THRUSTARRAY csr2csc_a(matrix->num_entries); 1350 PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0)); 1351 1352 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1353 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1354 void *csr2cscBuffer; 1355 size_t csr2cscBufferSize; 1356 stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, 1357 A->cmap->n, matrix->num_entries, 1358 matrix->values->data().get(), 1359 cusparsestruct->rowoffsets_gpu->data().get(), 1360 matrix->column_indices->data().get(), 1361 matrixT->values->data().get(), 1362 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1363 CUSPARSE_ACTION_NUMERIC,indexBase, 1364 cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat); 1365 err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err); 1366 #endif 1367 1368 if (matrix->num_entries) { 1369 /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in 1370 mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK. 
I checked every parameter and they were just fine. I have no clue why cusparse complains. 1372 1373 Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[] 1374 should be filled with indexBase. So I just take a shortcut here. 1375 */ 1376 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1377 A->cmap->n,matrix->num_entries, 1378 csr2csc_a.data().get(), 1379 cusparsestruct->rowoffsets_gpu->data().get(), 1380 matrix->column_indices->data().get(), 1381 matrixT->values->data().get(), 1382 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1383 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1384 CUSPARSE_ACTION_NUMERIC,indexBase, 1385 cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat); 1386 #else 1387 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1388 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1389 #endif 1390 } else { 1391 matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 1392 } 1393 1394 cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); 1395 PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt())); 1396 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1397 err = cudaFree(csr2cscBuffer);CHKERRCUDA(err); 1398 #endif 1399 } 1400 PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), 1401 thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), 1402 matrixT->values->begin())); 1403 } 1404 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1405 ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1406 /* the compressed row indices are not used for matTranspose */ 1407 matstructT->cprowIndices = NULL; 1408 /* assign the pointer */ 1409 ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 1410 A->transupdated = PETSC_TRUE; 1411 PetscFunctionReturn(0); 1412 } 1413 1414 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 1415 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1416 { 1417 PetscInt n = xx->map->n; 1418 const PetscScalar *barray; 1419 PetscScalar *xarray; 1420 thrust::device_ptr<const PetscScalar> bGPU; 1421 thrust::device_ptr<PetscScalar> xGPU; 1422 cusparseStatus_t stat; 1423 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1424 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1425 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1426 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1427 PetscErrorCode ierr; 1428 1429 PetscFunctionBegin; 1430 /* Analyze the matrix and create the transpose ...
on the fly */ 1431 if (!loTriFactorT && !upTriFactorT) { 1432 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1433 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1434 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1435 } 1436 1437 /* Get the GPU pointers */ 1438 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1439 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1440 xGPU = thrust::device_pointer_cast(xarray); 1441 bGPU = thrust::device_pointer_cast(barray); 1442 1443 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1444 /* First, reorder with the row permutation */ 1445 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1446 thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 1447 xGPU); 1448 1449 /* First, solve U */ 1450 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1451 upTriFactorT->csrMat->num_rows, 1452 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1453 upTriFactorT->csrMat->num_entries, 1454 #endif 1455 &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1456 upTriFactorT->csrMat->values->data().get(), 1457 upTriFactorT->csrMat->row_offsets->data().get(), 1458 upTriFactorT->csrMat->column_indices->data().get(), 1459 upTriFactorT->solveInfo, 1460 xarray, 1461 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1462 tempGPU->data().get(), 1463 upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1464 #else 1465 tempGPU->data().get());CHKERRCUSPARSE(stat); 1466 #endif 1467 1468 /* Then, solve L */ 1469 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1470 loTriFactorT->csrMat->num_rows, 1471 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1472 loTriFactorT->csrMat->num_entries, 1473 #endif 1474 &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1475 loTriFactorT->csrMat->values->data().get(), 1476 loTriFactorT->csrMat->row_offsets->data().get(), 1477 loTriFactorT->csrMat->column_indices->data().get(), 1478 loTriFactorT->solveInfo, 1479 tempGPU->data().get(), 1480 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1481 xarray, 1482 loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1483 #else 1484 xarray);CHKERRCUSPARSE(stat); 1485 #endif 1486 1487 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1488 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1489 thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1490 tempGPU->begin()); 1491 1492 /* Copy the temporary to the full solution. 
*/ 1493 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU); 1494 1495 /* restore */ 1496 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1497 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1498 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1499 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1500 PetscFunctionReturn(0); 1501 } 1502 1503 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1504 { 1505 const PetscScalar *barray; 1506 PetscScalar *xarray; 1507 cusparseStatus_t stat; 1508 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1509 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1510 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1511 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1512 PetscErrorCode ierr; 1513 1514 PetscFunctionBegin; 1515 /* Analyze the matrix and create the transpose ... on the fly */ 1516 if (!loTriFactorT && !upTriFactorT) { 1517 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1518 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1519 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1520 } 1521 1522 /* Get the GPU pointers */ 1523 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1524 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1525 1526 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1527 /* First, solve U */ 1528 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1529 upTriFactorT->csrMat->num_rows, 1530 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1531 upTriFactorT->csrMat->num_entries, 1532 #endif 1533 &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1534 upTriFactorT->csrMat->values->data().get(), 1535 upTriFactorT->csrMat->row_offsets->data().get(), 1536 upTriFactorT->csrMat->column_indices->data().get(), 1537 upTriFactorT->solveInfo, 1538 barray, 1539 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1540 tempGPU->data().get(), 1541 upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1542 #else 1543 tempGPU->data().get());CHKERRCUSPARSE(stat); 1544 #endif 1545 1546 /* Then, solve L */ 1547 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1548 loTriFactorT->csrMat->num_rows, 1549 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1550 loTriFactorT->csrMat->num_entries, 1551 #endif 1552 &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1553 loTriFactorT->csrMat->values->data().get(), 1554 loTriFactorT->csrMat->row_offsets->data().get(), 1555 loTriFactorT->csrMat->column_indices->data().get(), 1556 loTriFactorT->solveInfo, 1557 tempGPU->data().get(), 1558 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1559 xarray, 1560 loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1561 #else 1562 xarray);CHKERRCUSPARSE(stat); 1563 #endif 1564 1565 /* restore */ 1566 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1567 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1568 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1569 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1570 PetscFunctionReturn(0); 1571 } 1572 1573 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1574 { 1575 const 
PetscScalar *barray; 1576 PetscScalar *xarray; 1577 thrust::device_ptr<const PetscScalar> bGPU; 1578 thrust::device_ptr<PetscScalar> xGPU; 1579 cusparseStatus_t stat; 1580 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1581 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1582 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1583 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1584 PetscErrorCode ierr; 1585 1586 PetscFunctionBegin; 1587 1588 /* Get the GPU pointers */ 1589 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1590 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1591 xGPU = thrust::device_pointer_cast(xarray); 1592 bGPU = thrust::device_pointer_cast(barray); 1593 1594 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1595 /* First, reorder with the row permutation */ 1596 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1597 thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 1598 tempGPU->begin()); 1599 1600 /* Next, solve L */ 1601 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1602 loTriFactor->csrMat->num_rows, 1603 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1604 loTriFactor->csrMat->num_entries, 1605 #endif 1606 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1607 loTriFactor->csrMat->values->data().get(), 1608 loTriFactor->csrMat->row_offsets->data().get(), 1609 loTriFactor->csrMat->column_indices->data().get(), 1610 loTriFactor->solveInfo, 1611 tempGPU->data().get(), 1612 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1613 xarray, 1614 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1615 #else 1616 xarray);CHKERRCUSPARSE(stat); 1617 #endif 1618 1619 /* Then, solve U */ 1620 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1621 upTriFactor->csrMat->num_rows, 1622 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1623 upTriFactor->csrMat->num_entries, 1624 #endif 1625 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1626 upTriFactor->csrMat->values->data().get(), 1627 upTriFactor->csrMat->row_offsets->data().get(), 1628 upTriFactor->csrMat->column_indices->data().get(), 1629 upTriFactor->solveInfo,xarray, 1630 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1631 tempGPU->data().get(), 1632 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1633 #else 1634 tempGPU->data().get());CHKERRCUSPARSE(stat); 1635 #endif 1636 1637 /* Last, reorder with the column permutation */ 1638 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 1639 thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), 1640 xGPU); 1641 1642 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1643 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1644 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1645 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1646 PetscFunctionReturn(0); 1647 } 1648 1649 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1650 { 1651 const PetscScalar *barray; 1652 PetscScalar *xarray; 1653 cusparseStatus_t stat; 1654 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = 
(Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1655 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1656 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1657 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1658 PetscErrorCode ierr; 1659 1660 PetscFunctionBegin; 1661 /* Get the GPU pointers */ 1662 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1663 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1664 1665 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1666 /* First, solve L */ 1667 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1668 loTriFactor->csrMat->num_rows, 1669 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1670 loTriFactor->csrMat->num_entries, 1671 #endif 1672 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1673 loTriFactor->csrMat->values->data().get(), 1674 loTriFactor->csrMat->row_offsets->data().get(), 1675 loTriFactor->csrMat->column_indices->data().get(), 1676 loTriFactor->solveInfo, 1677 barray, 1678 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1679 tempGPU->data().get(), 1680 loTriFactor->solvePolicy,loTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1681 #else 1682 tempGPU->data().get());CHKERRCUSPARSE(stat); 1683 #endif 1684 1685 /* Next, solve U */ 1686 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1687 upTriFactor->csrMat->num_rows, 1688 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1689 upTriFactor->csrMat->num_entries, 1690 #endif 1691 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1692 upTriFactor->csrMat->values->data().get(), 1693 upTriFactor->csrMat->row_offsets->data().get(), 1694 upTriFactor->csrMat->column_indices->data().get(), 1695 upTriFactor->solveInfo, 1696 tempGPU->data().get(), 1697 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1698 xarray, 1699 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1700 #else 1701 xarray);CHKERRCUSPARSE(stat); 1702 #endif 1703 1704 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1705 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1706 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1707 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1708 PetscFunctionReturn(0); 1709 } 1710 1711 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) 1712 { 1713 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1714 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1715 cudaError_t cerr; 1716 PetscErrorCode ierr; 1717 1718 PetscFunctionBegin; 1719 if (A->offloadmask == PETSC_OFFLOAD_GPU) { 1720 CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat; 1721 1722 ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 1723 cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 1724 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1725 ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr); 1726 ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 1727 A->offloadmask = PETSC_OFFLOAD_BOTH; 1728 } 1729 PetscFunctionReturn(0); 1730 } 1731 1732 static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 1733 { 1734 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1735 PetscErrorCode ierr; 1736 1737 PetscFunctionBegin; 1738 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 1739 *array = a->a; 1740 A->offloadmask = PETSC_OFFLOAD_CPU; 1741 PetscFunctionReturn(0); 1742 } 1743 1744 PETSC_INTERN 
PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 1745 { 1746 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1747 Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 1748 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1749 PetscInt m = A->rmap->n,*ii,*ridx,tmp; 1750 PetscErrorCode ierr; 1751 cusparseStatus_t stat; 1752 PetscBool both = PETSC_TRUE; 1753 cudaError_t err; 1754 1755 PetscFunctionBegin; 1756 if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU"); 1757 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 1758 if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */ 1759 CsrMatrix *matrix; 1760 matrix = (CsrMatrix*)cusparsestruct->mat->mat; 1761 1762 if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values"); 1763 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1764 matrix->values->assign(a->a, a->a+a->nz); 1765 err = WaitForCUDA();CHKERRCUDA(err); 1766 ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 1767 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1768 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); 1769 } else { 1770 PetscInt nnz; 1771 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1772 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr); 1773 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 1774 delete cusparsestruct->workVector; 1775 delete cusparsestruct->rowoffsets_gpu; 1776 cusparsestruct->workVector = NULL; 1777 cusparsestruct->rowoffsets_gpu = NULL; 1778 try { 1779 if (a->compressedrow.use) { 1780 m = a->compressedrow.nrows; 1781 ii = a->compressedrow.i; 1782 ridx = a->compressedrow.rindex; 1783 } else { 1784 m = A->rmap->n; 1785 ii = a->i; 1786 ridx = NULL; 1787 } 1788 if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data"); 1789 if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data"); 1790 if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; } 1791 else nnz = a->nz; 1792 1793 /* create cusparse matrix */ 1794 cusparsestruct->nrows = m; 1795 matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 1796 stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat); 1797 stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 1798 stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1799 1800 err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 1801 err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1802 err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1803 err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1804 err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1805 err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1806 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 1807 1808 /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1809 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1810 /* set the matrix */ 
1811 CsrMatrix *mat= new CsrMatrix; 1812 mat->num_rows = m; 1813 mat->num_cols = A->cmap->n; 1814 mat->num_entries = nnz; 1815 mat->row_offsets = new THRUSTINTARRAY32(m+1); 1816 mat->row_offsets->assign(ii, ii + m+1); 1817 1818 mat->column_indices = new THRUSTINTARRAY32(nnz); 1819 mat->column_indices->assign(a->j, a->j+nnz); 1820 1821 mat->values = new THRUSTARRAY(nnz); 1822 if (a->a) mat->values->assign(a->a, a->a+nnz); 1823 1824 /* assign the pointer */ 1825 matstruct->mat = mat; 1826 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1827 if (mat->num_rows) { /* cusparse errors on empty matrices! */ 1828 stat = cusparseCreateCsr(&matstruct->matDescr, 1829 mat->num_rows, mat->num_cols, mat->num_entries, 1830 mat->row_offsets->data().get(), mat->column_indices->data().get(), 1831 mat->values->data().get(), 1832 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 1833 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 1834 } 1835 #endif 1836 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1837 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1838 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1839 #else 1840 CsrMatrix *mat= new CsrMatrix; 1841 mat->num_rows = m; 1842 mat->num_cols = A->cmap->n; 1843 mat->num_entries = nnz; 1844 mat->row_offsets = new THRUSTINTARRAY32(m+1); 1845 mat->row_offsets->assign(ii, ii + m+1); 1846 1847 mat->column_indices = new THRUSTINTARRAY32(nnz); 1848 mat->column_indices->assign(a->j, a->j+nnz); 1849 1850 mat->values = new THRUSTARRAY(nnz); 1851 if (a->a) mat->values->assign(a->a, a->a+nnz); 1852 1853 cusparseHybMat_t hybMat; 1854 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1855 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 
1856 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1857 stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, 1858 matstruct->descr, mat->values->data().get(), 1859 mat->row_offsets->data().get(), 1860 mat->column_indices->data().get(), 1861 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1862 /* assign the pointer */ 1863 matstruct->mat = hybMat; 1864 1865 if (mat) { 1866 if (mat->values) delete (THRUSTARRAY*)mat->values; 1867 if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices; 1868 if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets; 1869 delete (CsrMatrix*)mat; 1870 } 1871 #endif 1872 } 1873 1874 /* assign the compressed row indices */ 1875 if (a->compressedrow.use) { 1876 cusparsestruct->workVector = new THRUSTARRAY(m); 1877 matstruct->cprowIndices = new THRUSTINTARRAY(m); 1878 matstruct->cprowIndices->assign(ridx,ridx+m); 1879 tmp = m; 1880 } else { 1881 cusparsestruct->workVector = NULL; 1882 matstruct->cprowIndices = NULL; 1883 tmp = 0; 1884 } 1885 ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr); 1886 1887 /* assign the pointer */ 1888 cusparsestruct->mat = matstruct; 1889 } catch(char *ex) { 1890 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1891 } 1892 err = WaitForCUDA();CHKERRCUDA(err); 1893 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1894 cusparsestruct->nonzerostate = A->nonzerostate; 1895 } 1896 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 1897 } 1898 PetscFunctionReturn(0); 1899 } 1900 1901 struct VecCUDAPlusEquals 1902 { 1903 template <typename Tuple> 1904 __host__ __device__ 1905 void operator()(Tuple t) 1906 { 1907 thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 1908 } 1909 }; 1910 1911 struct VecCUDAEquals 1912 { 1913 template <typename Tuple> 1914 __host__ __device__ 1915 void operator()(Tuple t) 1916 { 1917 thrust::get<1>(t) = thrust::get<0>(t); 1918 } 1919 }; 1920 1921 struct VecCUDAEqualsReverse 1922 { 1923 template <typename Tuple> 1924 __host__ __device__ 1925 void operator()(Tuple t) 1926 { 1927 thrust::get<0>(t) = thrust::get<1>(t); 1928 } 1929 }; 1930 1931 struct MatMatCusparse { 1932 PetscBool cisdense; 1933 PetscScalar *Bt; 1934 Mat X; 1935 PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */ 1936 PetscLogDouble flops; 1937 CsrMatrix *Bcsr; 1938 1939 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1940 cusparseSpMatDescr_t matSpBDescr; 1941 PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 1942 cusparseDnMatDescr_t matBDescr; 1943 cusparseDnMatDescr_t matCDescr; 1944 PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/ 1945 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 1946 void *dBuffer4; 1947 void *dBuffer5; 1948 #endif 1949 size_t mmBufferSize; 1950 void *mmBuffer; 1951 void *mmBuffer2; /* SpGEMM WorkEstimation buffer */ 1952 cusparseSpGEMMDescr_t spgemmDesc; 1953 #endif 1954 }; 1955 1956 static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 1957 { 1958 PetscErrorCode ierr; 1959 MatMatCusparse *mmdata = (MatMatCusparse *)data; 1960 cudaError_t cerr; 1961 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1962 cusparseStatus_t stat; 1963 #endif 1964 1965 PetscFunctionBegin; 1966 cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr); 1967 delete mmdata->Bcsr; 1968 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1969 if (mmdata->matSpBDescr) { stat = 
cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); } 1970 if (mmdata->matBDescr) { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); } 1971 if (mmdata->matCDescr) { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); } 1972 if (mmdata->spgemmDesc) { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); } 1973 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 1974 if (mmdata->dBuffer4) { cerr = cudaFree(mmdata->dBuffer4);CHKERRCUDA(cerr); } 1975 if (mmdata->dBuffer5) { cerr = cudaFree(mmdata->dBuffer5);CHKERRCUDA(cerr); } 1976 #endif 1977 if (mmdata->mmBuffer) { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); } 1978 if (mmdata->mmBuffer2) { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); } 1979 #endif 1980 ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr); 1981 ierr = PetscFree(data);CHKERRQ(ierr); 1982 PetscFunctionReturn(0); 1983 } 1984 1985 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool); 1986 1987 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 1988 { 1989 Mat_Product *product = C->product; 1990 Mat A,B; 1991 PetscInt m,n,blda,clda; 1992 PetscBool flg,biscuda; 1993 Mat_SeqAIJCUSPARSE *cusp; 1994 cusparseStatus_t stat; 1995 cusparseOperation_t opA; 1996 const PetscScalar *barray; 1997 PetscScalar *carray; 1998 PetscErrorCode ierr; 1999 MatMatCusparse *mmdata; 2000 Mat_SeqAIJCUSPARSEMultStruct *mat; 2001 CsrMatrix *csrmat; 2002 2003 PetscFunctionBegin; 2004 MatCheckProduct(C,1); 2005 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty"); 2006 mmdata = (MatMatCusparse*)product->data; 2007 A = product->A; 2008 B = product->B; 2009 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2010 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2011 /* currently CopyToGpu does not copy if the matrix is bound to CPU 2012 Instead of silently accepting the wrong answer, I prefer to raise the error */ 2013 if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2014 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2015 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2016 switch (product->type) { 2017 case MATPRODUCT_AB: 2018 case MATPRODUCT_PtAP: 2019 mat = cusp->mat; 2020 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2021 m = A->rmap->n; 2022 n = B->cmap->n; 2023 break; 2024 case MATPRODUCT_AtB: 2025 if (!A->form_explicit_transpose) { 2026 mat = cusp->mat; 2027 opA = CUSPARSE_OPERATION_TRANSPOSE; 2028 } else { 2029 ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr); 2030 mat = cusp->matTranspose; 2031 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2032 } 2033 m = A->cmap->n; 2034 n = B->cmap->n; 2035 break; 2036 case MATPRODUCT_ABt: 2037 case MATPRODUCT_RARt: 2038 mat = cusp->mat; 2039 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2040 m = A->rmap->n; 2041 n = B->rmap->n; 2042 break; 2043 default: 2044 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2045 } 2046 if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 2047 csrmat = (CsrMatrix*)mat->mat; 2048 /* if the user passed a CPU matrix, copy the data to the GPU */ 2049 ierr = 
PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr); 2050 if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);} 2051 ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr); 2052 2053 ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr); 2054 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2055 ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2056 ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr); 2057 } else { 2058 ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr); 2059 ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr); 2060 } 2061 2062 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2063 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2064 cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 2065 /* (re)allcoate mmBuffer if not initialized or LDAs are different */ 2066 if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 2067 size_t mmBufferSize; 2068 if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;} 2069 if (!mmdata->matBDescr) { 2070 stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2071 mmdata->Blda = blda; 2072 } 2073 2074 if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;} 2075 if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 2076 stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2077 mmdata->Clda = clda; 2078 } 2079 2080 if (!mat->matDescr) { 2081 stat = cusparseCreateCsr(&mat->matDescr, 2082 csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, 2083 csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), 2084 csrmat->values->data().get(), 2085 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2086 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 2087 } 2088 stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one, 2089 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2090 mmdata->matCDescr,cusparse_scalartype, 2091 cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat); 2092 if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) { 2093 cudaError_t cerr; 2094 cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); 2095 cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr); 2096 mmdata->mmBufferSize = mmBufferSize; 2097 } 2098 mmdata->initialized = PETSC_TRUE; 2099 } else { 2100 /* to be safe, always update pointers of the mats */ 2101 stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat); 2102 stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat); 2103 stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat); 2104 } 2105 2106 /* do cusparseSpMM, which supports transpose on B */ 2107 stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one, 2108 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2109 mmdata->matCDescr,cusparse_scalartype, 2110 cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2111 #else 2112 PetscInt k; 2113 /* cusparseXcsrmm does not support 
transpose on B */ 2114 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2115 cublasHandle_t cublasv2handle; 2116 cublasStatus_t cerr; 2117 2118 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 2119 cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T, 2120 B->cmap->n,B->rmap->n, 2121 &PETSC_CUSPARSE_ONE ,barray,blda, 2122 &PETSC_CUSPARSE_ZERO,barray,blda, 2123 mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr); 2124 blda = B->cmap->n; 2125 k = B->cmap->n; 2126 } else { 2127 k = B->rmap->n; 2128 } 2129 2130 /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 2131 stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k, 2132 csrmat->num_entries,mat->alpha_one,mat->descr, 2133 csrmat->values->data().get(), 2134 csrmat->row_offsets->data().get(), 2135 csrmat->column_indices->data().get(), 2136 mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero, 2137 carray,clda);CHKERRCUSPARSE(stat); 2138 #endif 2139 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2140 ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr); 2141 ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr); 2142 if (product->type == MATPRODUCT_RARt) { 2143 ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2144 ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2145 } else if (product->type == MATPRODUCT_PtAP) { 2146 ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2147 ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2148 } else { 2149 ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr); 2150 } 2151 if (mmdata->cisdense) { 2152 ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr); 2153 } 2154 if (!biscuda) { 2155 ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 2156 } 2157 PetscFunctionReturn(0); 2158 } 2159 2160 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2161 { 2162 Mat_Product *product = C->product; 2163 Mat A,B; 2164 PetscInt m,n; 2165 PetscBool cisdense,flg; 2166 PetscErrorCode ierr; 2167 MatMatCusparse *mmdata; 2168 Mat_SeqAIJCUSPARSE *cusp; 2169 2170 PetscFunctionBegin; 2171 MatCheckProduct(C,1); 2172 if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty"); 2173 A = product->A; 2174 B = product->B; 2175 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2176 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2177 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2178 if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2179 switch (product->type) { 2180 case MATPRODUCT_AB: 2181 m = A->rmap->n; 2182 n = B->cmap->n; 2183 break; 2184 case MATPRODUCT_AtB: 2185 m = A->cmap->n; 2186 n = B->cmap->n; 2187 break; 2188 case MATPRODUCT_ABt: 2189 m = A->rmap->n; 2190 n = B->rmap->n; 2191 break; 2192 case MATPRODUCT_PtAP: 2193 m = B->cmap->n; 2194 n = B->cmap->n; 2195 break; 2196 case MATPRODUCT_RARt: 2197 m = B->rmap->n; 2198 n = B->rmap->n; 2199 break; 2200 default: 2201 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2202 } 2203 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2204 /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU 
*/ 2205 ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr); 2206 ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr); 2207 2208 /* product data */ 2209 ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2210 mmdata->cisdense = cisdense; 2211 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0) 2212 /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */ 2213 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2214 cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr); 2215 } 2216 #endif 2217 /* for these products we need intermediate storage */ 2218 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2219 ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr); 2220 ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr); 2221 if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */ 2222 ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr); 2223 } else { 2224 ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr); 2225 } 2226 } 2227 C->product->data = mmdata; 2228 C->product->destroy = MatDestroy_MatMatCusparse; 2229 2230 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 2231 PetscFunctionReturn(0); 2232 } 2233 2234 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2235 { 2236 Mat_Product *product = C->product; 2237 Mat A,B; 2238 Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2239 Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 2240 Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2241 CsrMatrix *Acsr,*Bcsr,*Ccsr; 2242 PetscBool flg; 2243 PetscErrorCode ierr; 2244 cusparseStatus_t stat; 2245 cudaError_t cerr; 2246 MatProductType ptype; 2247 MatMatCusparse *mmdata; 2248 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2249 cusparseSpMatDescr_t BmatSpDescr; 2250 #endif 2251 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 2252 2253 PetscFunctionBegin; 2254 MatCheckProduct(C,1); 2255 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty"); 2256 ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2257 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name); 2258 mmdata = (MatMatCusparse*)C->product->data; 2259 A = product->A; 2260 B = product->B; 2261 if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */ 2262 mmdata->reusesym = PETSC_FALSE; 2263 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2264 if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2265 Cmat = Ccusp->mat; 2266 if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]); 2267 Ccsr = (CsrMatrix*)Cmat->mat; 2268 if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct"); 2269 goto finalize; 2270 } 2271 if (!c->nz) goto finalize; 2272 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2273 if (!flg) 
SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2274 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2275 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name); 2276 if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2277 if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2278 Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2279 Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2280 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2281 if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2282 if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2283 if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2284 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2285 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 2286 2287 ptype = product->type; 2288 if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 2289 if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 2290 switch (ptype) { 2291 case MATPRODUCT_AB: 2292 Amat = Acusp->mat; 2293 Bmat = Bcusp->mat; 2294 break; 2295 case MATPRODUCT_AtB: 2296 Amat = Acusp->matTranspose; 2297 Bmat = Bcusp->mat; 2298 break; 2299 case MATPRODUCT_ABt: 2300 Amat = Acusp->mat; 2301 Bmat = Bcusp->matTranspose; 2302 break; 2303 default: 2304 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2305 } 2306 Cmat = Ccusp->mat; 2307 if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2308 if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2309 if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]); 2310 Acsr = (CsrMatrix*)Amat->mat; 2311 Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */ 2312 Ccsr = (CsrMatrix*)Cmat->mat; 2313 if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct"); 2314 if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct"); 2315 if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct"); 2316 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2317 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2318 BmatSpDescr = mmdata->Bcsr ? 
mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */ 2319 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2320 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2321 stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, 2322 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2323 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2324 mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2325 #else 2326 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2327 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2328 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2329 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2330 stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, 2331 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2332 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2333 #endif 2334 #else 2335 stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, 2336 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2337 Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2338 Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2339 Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2340 #endif 2341 ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2342 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2343 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2344 C->offloadmask = PETSC_OFFLOAD_GPU; 2345 finalize: 2346 /* shorter version of MatAssemblyEnd_SeqAIJ */ 2347 ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 2348 ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 2349 ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr); 2350 c->reallocs = 0; 2351 C->info.mallocs += 0; 2352 C->info.nz_unneeded = 0; 2353 C->assembled = C->was_assembled = PETSC_TRUE; 2354 C->num_ass++; 2355 PetscFunctionReturn(0); 2356 } 2357 2358 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2359 { 2360 Mat_Product *product = C->product; 2361 Mat A,B; 2362 Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2363 Mat_SeqAIJ *a,*b,*c; 2364 Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2365 CsrMatrix *Acsr,*Bcsr,*Ccsr; 2366 PetscInt i,j,m,n,k; 2367 PetscBool flg; 2368 PetscErrorCode ierr; 2369 cusparseStatus_t stat; 2370 cudaError_t cerr; 2371 MatProductType ptype; 2372 MatMatCusparse *mmdata; 2373 PetscLogDouble flops; 2374 PetscBool biscompressed,ciscompressed; 2375 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2376 int64_t C_num_rows1, C_num_cols1, C_nnz1; 2377 cusparseSpMatDescr_t BmatSpDescr; 2378 #else 2379 int cnz; 2380 #endif 2381 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 2382 2383 PetscFunctionBegin; 2384 MatCheckProduct(C,1); 2385 if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty"); 2386 A = product->A; 2387 B = product->B; 2388 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2389 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for 
type %s",((PetscObject)A)->type_name); 2390 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2391 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name); 2392 a = (Mat_SeqAIJ*)A->data; 2393 b = (Mat_SeqAIJ*)B->data; 2394 Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2395 Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2396 if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2397 if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2398 2399 /* product data */ 2400 ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2401 C->product->data = mmdata; 2402 C->product->destroy = MatDestroy_MatMatCusparse; 2403 2404 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2405 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 2406 ptype = product->type; 2407 if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 2408 if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 2409 biscompressed = PETSC_FALSE; 2410 ciscompressed = PETSC_FALSE; 2411 switch (ptype) { 2412 case MATPRODUCT_AB: 2413 m = A->rmap->n; 2414 n = B->cmap->n; 2415 k = A->cmap->n; 2416 Amat = Acusp->mat; 2417 Bmat = Bcusp->mat; 2418 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2419 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2420 break; 2421 case MATPRODUCT_AtB: 2422 m = A->cmap->n; 2423 n = B->cmap->n; 2424 k = A->rmap->n; 2425 ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr); 2426 Amat = Acusp->matTranspose; 2427 Bmat = Bcusp->mat; 2428 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2429 break; 2430 case MATPRODUCT_ABt: 2431 m = A->rmap->n; 2432 n = B->rmap->n; 2433 k = A->cmap->n; 2434 ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr); 2435 Amat = Acusp->mat; 2436 Bmat = Bcusp->matTranspose; 2437 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2438 break; 2439 default: 2440 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2441 } 2442 2443 /* create cusparse matrix */ 2444 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2445 ierr = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 2446 c = (Mat_SeqAIJ*)C->data; 2447 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2448 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 2449 Ccsr = new CsrMatrix; 2450 2451 c->compressedrow.use = ciscompressed; 2452 if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */ 2453 c->compressedrow.nrows = a->compressedrow.nrows; 2454 ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr); 2455 ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr); 2456 Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 2457 Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 2458 Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows); 2459 } else { 2460 c->compressedrow.nrows = 0; 2461 c->compressedrow.i = NULL; 2462 c->compressedrow.rindex = NULL; 2463 Ccusp->workVector = NULL; 2464 Cmat->cprowIndices = NULL; 2465 } 2466 Ccusp->nrows = ciscompressed ? 
c->compressedrow.nrows : m; 2467 Ccusp->mat = Cmat; 2468 Ccusp->mat->mat = Ccsr; 2469 Ccsr->num_rows = Ccusp->nrows; 2470 Ccsr->num_cols = n; 2471 Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1); 2472 stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat); 2473 stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 2474 stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 2475 cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 2476 cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 2477 cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 2478 cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2479 cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2480 cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2481 if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */ 2482 thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0); 2483 c->nz = 0; 2484 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2485 Ccsr->values = new THRUSTARRAY(c->nz); 2486 goto finalizesym; 2487 } 2488 2489 if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2490 if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2491 Acsr = (CsrMatrix*)Amat->mat; 2492 if (!biscompressed) { 2493 Bcsr = (CsrMatrix*)Bmat->mat; 2494 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2495 BmatSpDescr = Bmat->matDescr; 2496 #endif 2497 } else { /* we need to use row offsets for the full matrix */ 2498 CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat; 2499 Bcsr = new CsrMatrix; 2500 Bcsr->num_rows = B->rmap->n; 2501 Bcsr->num_cols = cBcsr->num_cols; 2502 Bcsr->num_entries = cBcsr->num_entries; 2503 Bcsr->column_indices = cBcsr->column_indices; 2504 Bcsr->values = cBcsr->values; 2505 if (!Bcusp->rowoffsets_gpu) { 2506 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 2507 Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 2508 ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 2509 } 2510 Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 2511 mmdata->Bcsr = Bcsr; 2512 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2513 if (Bcsr->num_rows && Bcsr->num_cols) { 2514 stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, 2515 Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2516 Bcsr->values->data().get(), 2517 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2518 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2519 } 2520 BmatSpDescr = mmdata->matSpBDescr; 2521 #endif 2522 } 2523 if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct"); 2524 if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct"); 2525 /* precompute flops count */ 2526 if (ptype == MATPRODUCT_AB) { 2527 for (i=0, flops = 0; i<A->rmap->n; i++) { 2528 const PetscInt st = a->i[i]; 2529 const PetscInt en = a->i[i+1]; 2530 for (j=st; j<en; j++) { 2531 const PetscInt brow = 
a->j[j]; 2532 flops += 2.*(b->i[brow+1] - b->i[brow]); 2533 } 2534 } 2535 } else if (ptype == MATPRODUCT_AtB) { 2536 for (i=0, flops = 0; i<A->rmap->n; i++) { 2537 const PetscInt anzi = a->i[i+1] - a->i[i]; 2538 const PetscInt bnzi = b->i[i+1] - b->i[i]; 2539 flops += (2.*anzi)*bnzi; 2540 } 2541 } else { /* TODO */ 2542 flops = 0.; 2543 } 2544 2545 mmdata->flops = flops; 2546 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2547 2548 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2549 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2550 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, 2551 NULL, NULL, NULL, 2552 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2553 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2554 stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2555 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2556 { 2557 /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it. 2558 We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse 2559 */ 2560 void* dBuffer1 = NULL; 2561 void* dBuffer2 = NULL; 2562 void* dBuffer3 = NULL; 2563 /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */ 2564 size_t bufferSize1 = 0; 2565 size_t bufferSize2 = 0; 2566 size_t bufferSize3 = 0; 2567 size_t bufferSize4 = 0; 2568 size_t bufferSize5 = 0; 2569 2570 /*----------------------------------------------------------------------*/ 2571 /* ask bufferSize1 bytes for external memory */ 2572 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2573 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2574 &bufferSize1, NULL);CHKERRCUSPARSE(stat); 2575 cerr = cudaMalloc((void**) &dBuffer1, bufferSize1);CHKERRCUDA(cerr); 2576 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2577 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2578 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2579 &bufferSize1, dBuffer1);CHKERRCUSPARSE(stat); 2580 2581 /*----------------------------------------------------------------------*/ 2582 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2583 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2584 &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);CHKERRCUSPARSE(stat); 2585 cerr = cudaMalloc((void**) &dBuffer2, bufferSize2);CHKERRCUDA(cerr); 2586 cerr = cudaMalloc((void**) &dBuffer3, bufferSize3);CHKERRCUDA(cerr); 2587 cerr = cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4);CHKERRCUDA(cerr); 2588 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2589 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2590 &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);CHKERRCUSPARSE(stat); 2591 cerr = cudaFree(dBuffer1);CHKERRCUDA(cerr); 2592 cerr = cudaFree(dBuffer2);CHKERRCUDA(cerr); 2593 2594 /*----------------------------------------------------------------------*/ 2595 /* get matrix C non-zero entries C_nnz1 */ 2596 stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat); 2597 c->nz = (PetscInt) C_nnz1; 2598 /* allocate matrix C */ 2599 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of 
memory errors */ 2600 Ccsr->values = new THRUSTARRAY(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2601 /* update matC with the new pointers */ 2602 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2603 Ccsr->values->data().get());CHKERRCUSPARSE(stat); 2604 2605 /*----------------------------------------------------------------------*/ 2606 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2607 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2608 &bufferSize5, NULL);CHKERRCUSPARSE(stat); 2609 cerr = cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5);CHKERRCUDA(cerr); 2610 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2611 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2612 &bufferSize5, mmdata->dBuffer5);CHKERRCUSPARSE(stat); 2613 cerr = cudaFree(dBuffer3);CHKERRCUDA(cerr); 2614 stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, 2615 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2616 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2617 mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2618 ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024);CHKERRQ(ierr); 2619 } 2620 #else // ~PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2621 size_t bufSize2; 2622 /* ask bufferSize bytes for external memory */ 2623 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 2624 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2625 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2626 mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat); 2627 cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr); 2628 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2629 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 2630 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2631 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2632 mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat); 2633 /* ask bufferSize again bytes for external memory */ 2634 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2635 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2636 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2637 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat); 2638 /* The CUSPARSE documentation is not clear, nor the API 2639 We need both buffers to perform the operations properly! 2640 mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 2641 it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 2642 is stored in the descriptor! What a messy API... 
*/ 2643 cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr); 2644 /* compute the intermediate product of A * B */ 2645 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2646 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2647 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2648 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2649 /* get matrix C non-zero entries C_nnz1 */ 2650 stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat); 2651 c->nz = (PetscInt) C_nnz1; 2652 ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr); 2653 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2654 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2655 Ccsr->values = new THRUSTARRAY(c->nz); 2656 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2657 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2658 Ccsr->values->data().get());CHKERRCUSPARSE(stat); 2659 stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, 2660 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2661 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2662 #endif 2663 #else 2664 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 2665 stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, 2666 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2667 Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2668 Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2669 Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat); 2670 c->nz = cnz; 2671 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2672 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2673 Ccsr->values = new THRUSTARRAY(c->nz); 2674 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2675 2676 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2677 /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 2678 I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 2679 D is NULL, despite the fact that CUSPARSE documentation claims it is supported! 
*/ 2680 stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, 2681 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2682 Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2683 Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2684 Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2685 #endif 2686 ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2687 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2688 finalizesym: 2689 c->singlemalloc = PETSC_FALSE; 2690 c->free_a = PETSC_TRUE; 2691 c->free_ij = PETSC_TRUE; 2692 ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 2693 ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 2694 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 2695 PetscInt *d_i = c->i; 2696 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 2697 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 2698 ii = *Ccsr->row_offsets; 2699 jj = *Ccsr->column_indices; 2700 if (ciscompressed) d_i = c->compressedrow.i; 2701 cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2702 cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2703 } else { 2704 PetscInt *d_i = c->i; 2705 if (ciscompressed) d_i = c->compressedrow.i; 2706 cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2707 cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2708 } 2709 if (ciscompressed) { /* need to expand host row offsets */ 2710 PetscInt r = 0; 2711 c->i[0] = 0; 2712 for (k = 0; k < c->compressedrow.nrows; k++) { 2713 const PetscInt next = c->compressedrow.rindex[k]; 2714 const PetscInt old = c->compressedrow.i[k]; 2715 for (; r < next; r++) c->i[r+1] = old; 2716 } 2717 for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows]; 2718 } 2719 ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 2720 ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 2721 ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 2722 c->maxnz = c->nz; 2723 c->nonzerorowcnt = 0; 2724 c->rmax = 0; 2725 for (k = 0; k < m; k++) { 2726 const PetscInt nn = c->i[k+1] - c->i[k]; 2727 c->ilen[k] = c->imax[k] = nn; 2728 c->nonzerorowcnt += (PetscInt)!!nn; 2729 c->rmax = PetscMax(c->rmax,nn); 2730 } 2731 ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr); 2732 ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 2733 Ccsr->num_entries = c->nz; 2734 2735 C->nonzerostate++; 2736 ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr); 2737 ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr); 2738 Ccusp->nonzerostate = C->nonzerostate; 2739 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 2740 C->preallocated = PETSC_TRUE; 2741 C->assembled = PETSC_FALSE; 2742 C->was_assembled = PETSC_FALSE; 2743 if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */ 2744 mmdata->reusesym = PETSC_TRUE; 2745 C->offloadmask = PETSC_OFFLOAD_GPU; 2746 } 2747 C->ops->productnumeric = 
MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2748 PetscFunctionReturn(0); 2749 } 2750 2751 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 2752 2753 /* handles sparse or dense B */ 2754 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat) 2755 { 2756 Mat_Product *product = mat->product; 2757 PetscErrorCode ierr; 2758 PetscBool isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE; 2759 2760 PetscFunctionBegin; 2761 MatCheckProduct(mat,1); 2762 ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr); 2763 if (!product->A->boundtocpu && !product->B->boundtocpu) { 2764 ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr); 2765 } 2766 if (product->type == MATPRODUCT_ABC) { 2767 Ciscusp = PETSC_FALSE; 2768 if (!product->C->boundtocpu) { 2769 ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr); 2770 } 2771 } 2772 if (isdense) { 2773 switch (product->type) { 2774 case MATPRODUCT_AB: 2775 case MATPRODUCT_AtB: 2776 case MATPRODUCT_ABt: 2777 case MATPRODUCT_PtAP: 2778 case MATPRODUCT_RARt: 2779 if (product->A->boundtocpu) { 2780 ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr); 2781 } else { 2782 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 2783 } 2784 break; 2785 case MATPRODUCT_ABC: 2786 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2787 break; 2788 default: 2789 break; 2790 } 2791 } else if (Biscusp && Ciscusp) { 2792 switch (product->type) { 2793 case MATPRODUCT_AB: 2794 case MATPRODUCT_AtB: 2795 case MATPRODUCT_ABt: 2796 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2797 break; 2798 case MATPRODUCT_PtAP: 2799 case MATPRODUCT_RARt: 2800 case MATPRODUCT_ABC: 2801 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2802 break; 2803 default: 2804 break; 2805 } 2806 } else { /* fallback for AIJ */ 2807 ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 2808 } 2809 PetscFunctionReturn(0); 2810 } 2811 2812 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2813 { 2814 PetscErrorCode ierr; 2815 2816 PetscFunctionBegin; 2817 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2818 PetscFunctionReturn(0); 2819 } 2820 2821 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz) 2822 { 2823 PetscErrorCode ierr; 2824 2825 PetscFunctionBegin; 2826 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2827 PetscFunctionReturn(0); 2828 } 2829 2830 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2831 { 2832 PetscErrorCode ierr; 2833 2834 PetscFunctionBegin; 2835 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 2836 PetscFunctionReturn(0); 2837 } 2838 2839 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2840 { 2841 PetscErrorCode ierr; 2842 2843 PetscFunctionBegin; 2844 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 2845 PetscFunctionReturn(0); 2846 } 2847 2848 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2849 { 2850 PetscErrorCode ierr; 2851 2852 PetscFunctionBegin; 2853 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2854 PetscFunctionReturn(0); 2855 } 2856 2857 __global__ static 
void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y) 2858 { 2859 int i = blockIdx.x*blockDim.x + threadIdx.x; 2860 if (i < n) y[idx[i]] += x[i]; 2861 } 2862 2863 /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 2864 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm) 2865 { 2866 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2867 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 2868 Mat_SeqAIJCUSPARSEMultStruct *matstruct; 2869 PetscScalar *xarray,*zarray,*dptr,*beta,*xptr; 2870 PetscErrorCode ierr; 2871 cusparseStatus_t stat; 2872 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2873 PetscBool compressed; 2874 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2875 PetscInt nx,ny; 2876 #endif 2877 2878 PetscFunctionBegin; 2879 if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported"); 2880 if (!a->nonzerorowcnt) { 2881 if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);} 2882 else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);} 2883 PetscFunctionReturn(0); 2884 } 2885 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 2886 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2887 if (!trans) { 2888 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 2889 if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 2890 } else { 2891 if (herm || !A->form_explicit_transpose) { 2892 opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 2893 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 2894 } else { 2895 if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);} 2896 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 2897 } 2898 } 2899 /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 2900 compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE; 2901 2902 try { 2903 ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 2904 if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */ 2905 else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */ 2906 2907 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2908 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 2909 /* z = A x + beta y. 2910 If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 2911 When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 2912 */ 2913 xptr = xarray; 2914 dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 2915 beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 2916 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2917 /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 2918 allocated to accommodate different uses. So we get the length info directly from mat. 
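 (For the non-transpose case y = A x, cusparseSpMV expects x of length num_cols and y of length num_rows of the stored CSR matrix; the transpose branch below swaps the two, which is why nx/ny are assigned separately in each branch.)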
2919 */ 2920 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2921 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2922 nx = mat->num_cols; 2923 ny = mat->num_rows; 2924 } 2925 #endif 2926 } else { 2927 /* z = A^T x + beta y 2928 If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 2929 Note A^Tx is of full length, so we set beta to 1.0 if y exists. 2930 */ 2931 xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 2932 dptr = zarray; 2933 beta = yy ? matstruct->beta_one : matstruct->beta_zero; 2934 if (compressed) { /* Scatter x to work vector */ 2935 thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 2936 thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 2937 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 2938 VecCUDAEqualsReverse()); 2939 } 2940 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2941 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2942 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2943 nx = mat->num_rows; 2944 ny = mat->num_cols; 2945 } 2946 #endif 2947 } 2948 2949 /* csr_spmv does y = alpha op(A) x + beta y */ 2950 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2951 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2952 if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 2953 if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 2954 cudaError_t cerr; 2955 stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 2956 stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 2957 stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, 2958 matstruct->matDescr, 2959 matstruct->cuSpMV[opA].vecXDescr, beta, 2960 matstruct->cuSpMV[opA].vecYDescr, 2961 cusparse_scalartype, 2962 cusparsestruct->spmvAlg, 2963 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat); 2964 cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr); 2965 2966 matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 2967 } else { 2968 /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 2969 stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat); 2970 stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat); 2971 } 2972 2973 stat = cusparseSpMV(cusparsestruct->handle, opA, 2974 matstruct->alpha_one, 2975 matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTransposeForMult() */ 2976 matstruct->cuSpMV[opA].vecXDescr, 2977 beta, 2978 matstruct->cuSpMV[opA].vecYDescr, 2979 cusparse_scalartype, 2980 cusparsestruct->spmvAlg, 2981 matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat); 2982 #else 2983 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2984 stat = cusparse_csr_spmv(cusparsestruct->handle, opA, 2985 mat->num_rows, mat->num_cols, 2986 mat->num_entries, matstruct->alpha_one, matstruct->descr, 2987 mat->values->data().get(), mat->row_offsets->data().get(), 2988 
mat->column_indices->data().get(), xptr, beta, 2989 dptr);CHKERRCUSPARSE(stat); 2990 #endif 2991 } else { 2992 if (cusparsestruct->nrows) { 2993 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2994 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 2995 #else 2996 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 2997 stat = cusparse_hyb_spmv(cusparsestruct->handle, opA, 2998 matstruct->alpha_one, matstruct->descr, hybMat, 2999 xptr, beta, 3000 dptr);CHKERRCUSPARSE(stat); 3001 #endif 3002 } 3003 } 3004 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3005 3006 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3007 if (yy) { /* MatMultAdd: zz = A*xx + yy */ 3008 if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 3009 ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */ 3010 } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */ 3011 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 3012 } 3013 } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 3014 ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr); 3015 } 3016 3017 /* ScatterAdd the result from work vector into the full vector when A is compressed */ 3018 if (compressed) { 3019 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3020 /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred) 3021 and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to 3022 prevent that. So I just add a ScatterAdd kernel. 3023 */ 3024 #if 0 3025 thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 3026 thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream), 3027 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 3028 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 3029 VecCUDAPlusEquals()); 3030 #else 3031 PetscInt n = matstruct->cprowIndices->size(); 3032 ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray); 3033 #endif 3034 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3035 } 3036 } else { 3037 if (yy && yy != zz) { 3038 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 3039 } 3040 } 3041 ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 3042 if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);} 3043 else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);} 3044 } catch(char *ex) { 3045 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 3046 } 3047 if (yy) { 3048 ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); 3049 } else { 3050 ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr); 3051 } 3052 PetscFunctionReturn(0); 3053 } 3054 3055 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 3056 { 3057 PetscErrorCode ierr; 3058 3059 PetscFunctionBegin; 3060 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 3061 PetscFunctionReturn(0); 3062 } 3063 3064 static 
PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 3065 { 3066 PetscErrorCode ierr; 3067 PetscObjectState onnz = A->nonzerostate; 3068 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3069 3070 PetscFunctionBegin; 3071 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 3072 if (onnz != A->nonzerostate && cusp->deviceMat) { 3073 cudaError_t cerr; 3074 3075 ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr); 3076 cerr = cudaFree(cusp->deviceMat);CHKERRCUDA(cerr); 3077 cusp->deviceMat = NULL; 3078 } 3079 PetscFunctionReturn(0); 3080 } 3081 3082 /* --------------------------------------------------------------------------------*/ 3083 /*@ 3084 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 3085 (the default sequential PETSc format). This matrix will ultimately be pushed down 3086 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 3087 assembly performance the user should preallocate the matrix storage by setting 3088 the parameter nz (or the array nnz). By setting these parameters accurately, 3089 performance during matrix assembly can be increased by more than a factor of 50. 3090 3091 Collective 3092 3093 Input Parameters: 3094 + comm - MPI communicator, set to PETSC_COMM_SELF 3095 . m - number of rows 3096 . n - number of columns 3097 . nz - number of nonzeros per row (same for all rows) 3098 - nnz - array containing the number of nonzeros in the various rows 3099 (possibly different for each row) or NULL 3100 3101 Output Parameter: 3102 . A - the matrix 3103 3104 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 3105 MatXXXXSetPreallocation() paradigm instead of this routine directly. 3106 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 3107 3108 Notes: 3109 If nnz is given then nz is ignored. 3110 3111 The AIJ format (also called the Yale sparse matrix format or 3112 compressed row storage) is fully compatible with standard Fortran 77 3113 storage. That is, the stored row and column indices can begin at 3114 either one (as in Fortran) or zero. See the users' manual for details. 3115 3116 Specify the preallocated storage with either nz or nnz (not both). 3117 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 3118 allocation. For large problems you MUST preallocate memory or you 3119 will get TERRIBLE performance; see the users' manual chapter on matrices. 3120 3121 By default, this format uses inodes (identical nodes) when possible, to 3122 improve numerical efficiency of matrix-vector products and solves. We 3123 search for consecutive rows with the same nonzero structure, thereby 3124 reusing matrix information to achieve increased efficiency.
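 Example usage (an illustrative sketch, not taken from the PETSc examples; the size n and the estimate of 3 nonzeros per row are made up for demonstration):
.vb
      Mat      A;
      PetscInt n = 100;
      ierr = MatCreateSeqAIJCUSPARSE(PETSC_COMM_SELF,n,n,3,NULL,&A);CHKERRQ(ierr);  /* preallocate roughly 3 nonzeros per row */
      /* fill with MatSetValues(), then MatAssemblyBegin()/MatAssemblyEnd() exactly as for any AIJ matrix */
      ierr = MatDestroy(&A);CHKERRQ(ierr);
.ve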
3125 3126 Level: intermediate 3127 3128 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 3129 @*/ 3130 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 3131 { 3132 PetscErrorCode ierr; 3133 3134 PetscFunctionBegin; 3135 ierr = MatCreate(comm,A);CHKERRQ(ierr); 3136 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 3137 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3138 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 3139 PetscFunctionReturn(0); 3140 } 3141 3142 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 3143 { 3144 PetscErrorCode ierr; 3145 3146 PetscFunctionBegin; 3147 if (A->factortype == MAT_FACTOR_NONE) { 3148 ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr); 3149 } else { 3150 ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr); 3151 } 3152 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr); 3153 ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr); 3154 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 3155 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 3156 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr); 3157 ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 3158 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 3159 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 3160 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 3161 PetscFunctionReturn(0); 3162 } 3163 3164 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*); 3165 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool); 3166 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B) 3167 { 3168 PetscErrorCode ierr; 3169 3170 PetscFunctionBegin; 3171 ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 3172 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr); 3173 PetscFunctionReturn(0); 3174 } 3175 3176 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str) 3177 { 3178 PetscErrorCode ierr; 3179 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 3180 Mat_SeqAIJCUSPARSE *cy; 3181 Mat_SeqAIJCUSPARSE *cx; 3182 PetscScalar *ay; 3183 const PetscScalar *ax; 3184 CsrMatrix *csry,*csrx; 3185 3186 PetscFunctionBegin; 3187 cy = (Mat_SeqAIJCUSPARSE*)Y->spptr; 3188 cx = (Mat_SeqAIJCUSPARSE*)X->spptr; 3189 if (X->ops->axpy != Y->ops->axpy) { 3190 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr); 3191 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 3192 PetscFunctionReturn(0); 3193 } 3194 /* if we are here, it means both matrices are bound to GPU */ 3195 ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr); 3196 ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr); 3197 if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported"); 3198 if (cx->format 
!= MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported"); 3199 csry = (CsrMatrix*)cy->mat->mat; 3200 csrx = (CsrMatrix*)cx->mat->mat; 3201 /* see if we can turn this into a cublas axpy */ 3202 if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) { 3203 bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin()); 3204 if (eq) { 3205 eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin()); 3206 } 3207 if (eq) str = SAME_NONZERO_PATTERN; 3208 } 3209 /* spgeam is buggy with one column */ 3210 if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN; 3211 3212 if (str == SUBSET_NONZERO_PATTERN) { 3213 cusparseStatus_t stat; 3214 PetscScalar b = 1.0; 3215 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3216 size_t bufferSize; 3217 void *buffer; 3218 cudaError_t cerr; 3219 #endif 3220 3221 ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr); 3222 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3223 stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 3224 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3225 stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n, 3226 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3227 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3228 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat); 3229 cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr); 3230 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3231 stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3232 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3233 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3234 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat); 3235 ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr); 3236 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3237 cerr = cudaFree(buffer);CHKERRCUDA(cerr); 3238 #else 3239 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3240 stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3241 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3242 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3243 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat); 3244 ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr); 3245 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3246 #endif 3247 stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 3248 ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr); 3249 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3250 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3251 } else if (str == SAME_NONZERO_PATTERN) { 3252 cublasHandle_t cublasv2handle; 3253 cublasStatus_t berr; 3254 PetscBLASInt one = 1, bnz = 1; 3255 3256 ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr); 3257 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3258 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 3259 ierr = 
PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr); 3260 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3261 berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr); 3262 ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr); 3263 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3264 ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr); 3265 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3266 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3267 } else { 3268 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr); 3269 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 3270 } 3271 PetscFunctionReturn(0); 3272 } 3273 3274 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a) 3275 { 3276 PetscErrorCode ierr; 3277 Mat_SeqAIJ *y = (Mat_SeqAIJ*)Y->data; 3278 PetscScalar *ay; 3279 cublasHandle_t cublasv2handle; 3280 cublasStatus_t berr; 3281 PetscBLASInt one = 1, bnz = 1; 3282 3283 PetscFunctionBegin; 3284 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3285 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 3286 ierr = PetscBLASIntCast(y->nz,&bnz);CHKERRQ(ierr); 3287 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3288 berr = cublasXscal(cublasv2handle,bnz,&a,ay,one);CHKERRCUBLAS(berr); 3289 ierr = PetscLogGpuFlops(bnz);CHKERRQ(ierr); 3290 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3291 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3292 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3293 PetscFunctionReturn(0); 3294 } 3295 3296 static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) 3297 { 3298 PetscErrorCode ierr; 3299 PetscBool both = PETSC_FALSE; 3300 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3301 3302 PetscFunctionBegin; 3303 if (A->factortype == MAT_FACTOR_NONE) { 3304 Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr; 3305 if (spptr->mat) { 3306 CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat; 3307 if (matrix->values) { 3308 both = PETSC_TRUE; 3309 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3310 } 3311 } 3312 if (spptr->matTranspose) { 3313 CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat; 3314 if (matrix->values) { 3315 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3316 } 3317 } 3318 } 3319 //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 3320 ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr); 3321 ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 3322 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 3323 else A->offloadmask = PETSC_OFFLOAD_CPU; 3324 PetscFunctionReturn(0); 3325 } 3326 3327 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg) 3328 { 3329 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3330 PetscErrorCode ierr; 3331 3332 PetscFunctionBegin; 3333 if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0); 3334 if (flg) { 3335 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 3336 3337 A->ops->scale = MatScale_SeqAIJ; 3338 A->ops->axpy = MatAXPY_SeqAIJ; 3339 A->ops->zeroentries = MatZeroEntries_SeqAIJ; 3340 A->ops->mult = MatMult_SeqAIJ; 3341 A->ops->multadd = MatMultAdd_SeqAIJ; 3342 A->ops->multtranspose = MatMultTranspose_SeqAIJ; 3343 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 3344 A->ops->multhermitiantranspose = NULL; 3345 A->ops->multhermitiantransposeadd = NULL; 3346 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ; 3347 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr); 3348 ierr = 
PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 3349 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 3350 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 3351 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 3352 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr); 3353 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr); 3354 } else { 3355 A->ops->scale = MatScale_SeqAIJCUSPARSE; 3356 A->ops->axpy = MatAXPY_SeqAIJCUSPARSE; 3357 A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE; 3358 A->ops->mult = MatMult_SeqAIJCUSPARSE; 3359 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 3360 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 3361 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 3362 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 3363 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 3364 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE; 3365 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr); 3366 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3367 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3368 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3369 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3370 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr); 3371 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3372 } 3373 A->boundtocpu = flg; 3374 a->inode.use = flg; 3375 PetscFunctionReturn(0); 3376 } 3377 3378 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 3379 { 3380 PetscErrorCode ierr; 3381 cusparseStatus_t stat; 3382 Mat B; 3383 3384 PetscFunctionBegin; 3385 ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */ 3386 if (reuse == MAT_INITIAL_MATRIX) { 3387 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 3388 } else if (reuse == MAT_REUSE_MATRIX) { 3389 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 3390 } 3391 B = *newmat; 3392 3393 ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 3394 ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr); 3395 3396 if (reuse != MAT_REUSE_MATRIX && !B->spptr) { 3397 if (B->factortype == MAT_FACTOR_NONE) { 3398 Mat_SeqAIJCUSPARSE *spptr; 3399 ierr = PetscNew(&spptr);CHKERRQ(ierr); 3400 stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3401 stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat); 3402 spptr->format = MAT_CUSPARSE_CSR; 3403 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 
3404 spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 3405 spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default; we only support a column-major dense matrix B */ 3406 spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 3407 #endif 3408 B->spptr = spptr; 3409 } else { 3410 Mat_SeqAIJCUSPARSETriFactors *spptr; 3411 3412 ierr = PetscNew(&spptr);CHKERRQ(ierr); 3413 stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3414 stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat); 3415 B->spptr = spptr; 3416 } 3417 B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 3418 } 3419 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 3420 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 3421 B->ops->setoption = MatSetOption_SeqAIJCUSPARSE; 3422 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 3423 B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 3424 B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 3425 3426 ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr); 3427 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3428 ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 3429 PetscFunctionReturn(0); 3430 } 3431 3432 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 3433 { 3434 PetscErrorCode ierr; 3435 3436 PetscFunctionBegin; 3437 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 3438 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 3439 PetscFunctionReturn(0); 3440 } 3441 3442 /*MC 3443 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 3444 3445 A matrix type whose data resides on Nvidia GPUs. These matrices can be in 3446 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 3447 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 3448 3449 Options Database Keys: 3450 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 3451 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3452 - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
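 Example usage (an illustrative sketch; the assembly code is the same as for MATSEQAIJ, only the type changes, and the names A, n, nz are placeholders):
.vb
      ierr = MatCreate(PETSC_COMM_SELF,&A);CHKERRQ(ierr);
      ierr = MatSetSizes(A,n,n,n,n);CHKERRQ(ierr);
      ierr = MatSetType(A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);  /* or -mat_type aijcusparse together with MatSetFromOptions() */
      ierr = MatSeqAIJSetPreallocation(A,nz,NULL);CHKERRQ(ierr);
      /* MatSetValues()/MatAssemblyBegin()/MatAssemblyEnd() as usual; MatMult() etc. then run on the GPU via CUSPARSE */
.ve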
3453 3454 Level: beginner 3455 3456 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 3457 M*/ 3458 3459 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*); 3460 3461 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 3462 { 3463 PetscErrorCode ierr; 3464 3465 PetscFunctionBegin; 3466 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr); 3467 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3468 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3469 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3470 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3471 3472 PetscFunctionReturn(0); 3473 } 3474 3475 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 3476 { 3477 PetscErrorCode ierr; 3478 cusparseStatus_t stat; 3479 3480 PetscFunctionBegin; 3481 if (*cusparsestruct) { 3482 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr); 3483 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr); 3484 delete (*cusparsestruct)->workVector; 3485 delete (*cusparsestruct)->rowoffsets_gpu; 3486 delete (*cusparsestruct)->cooPerm; 3487 delete (*cusparsestruct)->cooPerm_a; 3488 delete (*cusparsestruct)->csr2csc_i; 3489 if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);} 3490 ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr); 3491 } 3492 PetscFunctionReturn(0); 3493 } 3494 3495 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 3496 { 3497 PetscFunctionBegin; 3498 if (*mat) { 3499 delete (*mat)->values; 3500 delete (*mat)->column_indices; 3501 delete (*mat)->row_offsets; 3502 delete *mat; 3503 *mat = 0; 3504 } 3505 PetscFunctionReturn(0); 3506 } 3507 3508 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 3509 { 3510 cusparseStatus_t stat; 3511 PetscErrorCode ierr; 3512 3513 PetscFunctionBegin; 3514 if (*trifactor) { 3515 if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); } 3516 if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); } 3517 ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 3518 if ((*trifactor)->solveBuffer) {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);} 3519 if ((*trifactor)->AA_h) {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);} 3520 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3521 if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);} 3522 #endif 3523 ierr = PetscFree(*trifactor);CHKERRQ(ierr); 3524 } 3525 PetscFunctionReturn(0); 3526 } 3527 3528 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 3529 { 3530 CsrMatrix *mat; 3531 cusparseStatus_t stat; 3532 
cudaError_t err; 3533 3534 PetscFunctionBegin; 3535 if (*matstruct) { 3536 if ((*matstruct)->mat) { 3537 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 3538 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3539 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3540 #else 3541 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 3542 stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat); 3543 #endif 3544 } else { 3545 mat = (CsrMatrix*)(*matstruct)->mat; 3546 CsrMatrix_Destroy(&mat); 3547 } 3548 } 3549 if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); } 3550 delete (*matstruct)->cprowIndices; 3551 if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); } 3552 if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); } 3553 if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); } 3554 3555 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3556 Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 3557 if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);} 3558 for (int i=0; i<3; i++) { 3559 if (mdata->cuSpMV[i].initialized) { 3560 err = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err); 3561 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat); 3562 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat); 3563 } 3564 } 3565 #endif 3566 delete *matstruct; 3567 *matstruct = NULL; 3568 } 3569 PetscFunctionReturn(0); 3570 } 3571 3572 PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors) 3573 { 3574 PetscErrorCode ierr; 3575 3576 PetscFunctionBegin; 3577 if (*trifactors) { 3578 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr); 3579 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr); 3580 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr); 3581 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr); 3582 delete (*trifactors)->rpermIndices; 3583 delete (*trifactors)->cpermIndices; 3584 delete (*trifactors)->workVector; 3585 (*trifactors)->rpermIndices = NULL; 3586 (*trifactors)->cpermIndices = NULL; 3587 (*trifactors)->workVector = NULL; 3588 if ((*trifactors)->a_band_d) {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);} 3589 if ((*trifactors)->i_band_d) {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);} 3590 (*trifactors)->init_dev_prop = PETSC_FALSE; 3591 } 3592 PetscFunctionReturn(0); 3593 } 3594 3595 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 3596 { 3597 PetscErrorCode ierr; 3598 cusparseHandle_t handle; 3599 cusparseStatus_t stat; 3600 3601 PetscFunctionBegin; 3602 if (*trifactors) { 3603 ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr); 3604 if (handle = (*trifactors)->handle) { 3605 stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 3606 } 3607 ierr = PetscFree(*trifactors);CHKERRQ(ierr); 3608 } 3609 PetscFunctionReturn(0); 3610 } 3611 3612 struct IJCompare 3613 { 3614 __host__ __device__ 3615 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3616 { 3617 if (t1.get<0>() < t2.get<0>()) return true; 3618 if (t1.get<0>() == 
t2.get<0>()) return t1.get<1>() < t2.get<1>(); 3619 return false; 3620 } 3621 }; 3622 3623 struct IJEqual 3624 { 3625 __host__ __device__ 3626 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3627 { 3628 if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false; 3629 return true; 3630 } 3631 }; 3632 3633 struct IJDiff 3634 { 3635 __host__ __device__ 3636 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 3637 { 3638 return t1 == t2 ? 0 : 1; 3639 } 3640 }; 3641 3642 struct IJSum 3643 { 3644 __host__ __device__ 3645 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 3646 { 3647 return t1||t2; 3648 } 3649 }; 3650 3651 #include <thrust/iterator/discard_iterator.h> 3652 PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 3653 { 3654 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3655 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3656 THRUSTARRAY *cooPerm_v = NULL; 3657 thrust::device_ptr<const PetscScalar> d_v; 3658 CsrMatrix *matrix; 3659 PetscErrorCode ierr; 3660 PetscInt n; 3661 3662 PetscFunctionBegin; 3663 if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct"); 3664 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix"); 3665 if (!cusp->cooPerm) { 3666 ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3667 ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3668 PetscFunctionReturn(0); 3669 } 3670 matrix = (CsrMatrix*)cusp->mat->mat; 3671 if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3672 if (!v) { 3673 if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3674 goto finalize; 3675 } 3676 n = cusp->cooPerm->size(); 3677 if (isCudaMem(v)) { 3678 d_v = thrust::device_pointer_cast(v); 3679 } else { 3680 cooPerm_v = new THRUSTARRAY(n); 3681 cooPerm_v->assign(v,v+n); 3682 d_v = cooPerm_v->data(); 3683 ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); 3684 } 3685 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3686 if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */ 3687 if (cusp->cooPerm_a) { 3688 THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size()); 3689 auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3690 thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3691 thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>()); 3692 delete cooPerm_w; 3693 } else { 3694 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 3695 matrix->values->begin())); 3696 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 3697 matrix->values->end())); 3698 thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); 3699 } 3700 } else { 3701 if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */ 3702 auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3703 thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3704 } else { 3705 
#include <thrust/iterator/discard_iterator.h>
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *cooPerm_v = NULL;
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *matrix;
  PetscErrorCode                        ierr;
  PetscInt                              n;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (imode == ADD_VALUES) { /* ADD_VALUES means add to existing ones */
    if (cusp->cooPerm_a) { /* repeated entries in COO: reduce the incoming values first, then add */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals());
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    }
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
  ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr);
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
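/* A minimal usage sketch of the COO assembly path above (array contents and sizes are
   illustrative only; the calls go through the generic MatSetPreallocationCOO()/MatSetValuesCOO()
   interface, which dispatches here for MATSEQAIJCUSPARSE):

     PetscInt    coo_i[] = {0,0,1};        // row indices; repeated (i,j) pairs are allowed
     PetscInt    coo_j[] = {2,2,5};        // column indices
     PetscScalar v[]     = {1.0,2.0,3.0};  // one value per (i,j) pair

     ierr = MatSetPreallocationCOO(A,3,coo_i,coo_j);CHKERRQ(ierr); // builds the pattern and cooPerm
     ierr = MatSetValuesCOO(A,v,ADD_VALUES);CHKERRQ(ierr);         // repeated entries are summed

   v may also be a device pointer: isCudaMem() above detects this and skips the host-to-device copy. */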
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (!cusp) PetscFunctionReturn(0);
  if (destroy) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
    delete cusp->csr2csc_i;
    cusp->csr2csc_i = NULL;
  }
  A->transupdated = PETSC_FALSE;
  PetscFunctionReturn(0);
}

#include <thrust/binary_search.h>
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0;
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm   = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());
    *cusp->cooPerm_a = d_i;
    THRUSTINTARRAY w = d_j;

    auto nekey = thrust::unique(fkey, ekey, IJEqual());
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* I couldn't come up with a more elegant algorithm */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());
      (*cusp->cooPerm_a)[0] = 0;
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(),
                        search_begin, search_begin + A->rmap->n,
                        ii.begin());
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0;
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated  = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);

  A->assembled     = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
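/* Worked example (illustration only) of the permutation data built above, for
   n = 3, coo_i = {1,0,0}, coo_j = {5,2,2} on a matrix with 2 rows:
     - after sort_by_key: sorted (i,j) = (0,2),(0,2),(1,5) and cooPerm = {1,2,0},
       i.e. cooPerm[k] is the position of the k-th sorted entry in the user's arrays;
     - the repeated (0,2) is detected, so cooPerm_a = {0,0,1},
       i.e. cooPerm_a[k] is the CSR slot the k-th sorted entry contributes to;
     - the resulting pattern has a->nz = 2 and row offsets a->i = {0,1,2}.
   MatSetValuesCOO_SeqAIJCUSPARSE() then gathers the user values through cooPerm and, when
   cooPerm_a is present, sums the contributions that share a slot with reduce_by_key(). */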
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  PetscFunctionReturn(0);
}

PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  *a = NULL;
  PetscFunctionReturn(0);
}

PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL;
  PetscFunctionReturn(0);
}

PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL;
  PetscFunctionReturn(0);
}
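/* A minimal usage sketch (not part of this file) for the device-array accessors above: zero all
   stored values of a MATSEQAIJCUSPARSE matrix directly on the GPU.

     PetscScalar *va;
     PetscInt    nz = ((Mat_SeqAIJ*)A->data)->nz;
     cudaError_t cerr;

     ierr = MatSeqAIJCUSPARSEGetArrayWrite(A,&va);CHKERRQ(ierr);     // device pointer to csr->values
     cerr = cudaMemset(va,0,nz*sizeof(PetscScalar));CHKERRCUDA(cerr);
     ierr = MatSeqAIJCUSPARSERestoreArrayWrite(A,&va);CHKERRQ(ierr); // bumps the object state

   Use the Read variants for read-only access; the plain and Write variants mark the GPU copy as
   authoritative (PETSC_OFFLOAD_GPU) and invalidate any cached transpose. */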
/* Order (row, col, value, flag) tuples by (row, col); used when merging two CSR matrices below */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    if (t1.get<0>() < t2.get<0>()) return true;
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
    return false;
  }
};

/* Add a constant offset to an index */
struct Shift
{
  int _shift;

  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &c)
  {
    return c + _shift;
  }
};
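/* Shift is used below with thrust::make_transform_iterator so that B's column indices are read
   as if they were already offset by A's column count, without modifying B. Sketch of the idiom
   (b_cols and ncolsA are placeholder names):

     auto it = thrust::make_transform_iterator(b_cols->begin(),Shift(ncolsA)); // yields b_cols[k] + ncolsA
*/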
/* Merges two SeqAIJCUSPARSE matrices side by side, i.e. the [A';B']' operation in MATLAB notation */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     Annz,Bnnz;
  cusparseStatus_t             stat;
  PetscInt                     i,m,n,zero = 0;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",A->rmap->n,B->rmap->n);
  if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
  if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    m = A->rmap->n;
    n = A->cmap->n + B->cmap->n;
    ierr = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
    ierr = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
    ierr = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    c = (Mat_SeqAIJ*)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
    Cmat = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector = NULL;
    Ccusp->nrows      = m;
    Ccusp->mat        = Cmat;
    Ccusp->mat->mat   = Ccsr;
    Ccsr->num_rows    = m;
    Ccsr->num_cols    = n;
    stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr);
    if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr = (CsrMatrix*)Acusp->mat->mat;
    Bcsr = (CsrMatrix*)Bcusp->mat->mat;
    Annz = (PetscInt)Acsr->column_indices->size();
    Bnnz = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets    = new THRUSTINTARRAY32(m+1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    Ccsr->num_entries    = c->nz;
    Ccusp->cooPerm       = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      auto Acoo = new THRUSTINTARRAY32(Annz);
      auto Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff,*Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
          ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
          ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      stat = cusparseXcsr2coo(Acusp->handle,
                              Aroff->data().get(),
                              Annz,
                              m,
                              Acoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle,
                              Broff->data().get(),
                              Bnnz,
                              m,
                              Bcoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
      auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
      auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
      auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
      auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
      auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
      auto p1 = Ccusp->cooPerm->begin();
      auto p2 = Ccusp->cooPerm->begin();
      thrust::advance(p2,Annz);
      PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
      thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
#endif
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 /* Errors on SUMMIT cuda 11.1.0 */
      PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      auto pred = thrust::identity<int>();
      PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
      PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
#endif
      stat = cusparseXcoo2csr(Ccusp->handle,
                              Ccoo->data().get(),
                              c->nz,
                              m,
                              Ccsr->row_offsets->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
                               Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix *CcsrT = new CsrMatrix;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated    = PETSC_TRUE;
        Ccusp->rowoffsets_gpu = NULL;
        CmatT->cprowIndices   = NULL;
        CmatT->mat         = CcsrT;
        CcsrT->num_rows    = n;
        CcsrT->num_cols    = m;
        CcsrT->num_entries = c->nz;

        CcsrT->row_offsets    = new THRUSTINTARRAY32(n+1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values         = new THRUSTARRAY(c->nz);

        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
          thrust::advance(rT,-1);
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
          thrust::copy(titb,tite,rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

        stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
                                 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
                                 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii = *Ccsr->row_offsets;
      jj = *Ccsr->column_indices;
      cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    } else {
      cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
    c->maxnz         = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax          = 0;
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i+1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax,nn);
    }
    ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
    (*C)->nonzerostate++;
    ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
    ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",(*C)->rmap->n,B->rmap->n);
    c = (Mat_SeqAIJ*)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
      if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
      if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
      if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
      if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix*)Acusp->mat->mat;
      Bcsr = (CsrMatrix*)Bcusp->mat->mat;
      Ccsr = (CsrMatrix*)Ccusp->mat->mat;
      if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
      if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
      if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
      if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
      if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid,Acsr->num_entries);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      thrust::for_each(zibait,zieait,VecCUDAEquals());
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
      thrust::for_each(zibbit,ziebit,VecCUDAEquals());
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr);
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    }
  }
  ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
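/* A minimal usage sketch (not part of this file) for MatSeqAIJCUSPARSEMergeMats(): horizontally
   concatenate two device matrices with the same number of rows, C = [A B] in MATLAB notation.

     Mat C;
     ierr = MatSeqAIJCUSPARSEMergeMats(A,B,MAT_INITIAL_MATRIX,&C);CHKERRQ(ierr); // allocates C and its pattern
     // ... change the values of A and/or B, keeping their nonzero patterns ...
     ierr = MatSeqAIJCUSPARSEMergeMats(A,B,MAT_REUSE_MATRIX,&C);CHKERRQ(ierr);   // refreshes C's values only
     ierr = MatDestroy(&C);CHKERRQ(ierr);

   MAT_INPLACE_MATRIX is rejected above, and both inputs must be MATSEQAIJCUSPARSE in CSR format. */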
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;
  const PetscScalar *av;
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    THRUSTARRAY *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w  = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) {
      cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    delete w;
  } else {
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  if (!dmem) { ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); }
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}