/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <thrust/adjacent_difference.h>
#include <thrust/async/for_each.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/remove.h>
#include <thrust/sort.h>
#include <thrust/unique.h>

const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc., we copy them in
   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
  } cusparseCsr2CscAlg_t;
*/
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif
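/* Illustrative note (not part of the cuSPARSE API): PetscOptionsEnum() returns the 0-based position of
   the selected string, so the arrays above must list the names in the same order as the integer values
   of the corresponding cuSPARSE enums, e.g.

     MatCUSPARSESpMVAlgorithms[2] is "CSRMV_ALG1"  <->  CUSPARSE_CSRMV_ALG1   == 2
     MatCUSPARSESpMMAlgorithms[4] is "CSR_ALG1"    <->  CUSPARSE_SPMM_CSR_ALG1 == 4

   MatSetFromOptions_SeqAIJCUSPARSE() below checks this correspondence at runtime. */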
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);

PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          flg;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg || !cusparsestruct) PetscFunctionReturn(0);
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for sequential matrices
  on a single GPU of type seqaijcusparse, aijcusparse, seqaijcusp, or aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
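/* Example usage (a minimal sketch, not compiled here; it assumes a PC "pc" and a matrix A set up elsewhere):

     ierr = MatSetType(A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
     ierr = PCSetType(pc,PCILU);CHKERRQ(ierr);
     ierr = PCFactorSetMatSolverType(pc,MATSOLVERCUSPARSE);CHKERRQ(ierr);

   or equivalently from the command line:

     -mat_type seqaijcusparse -pc_type ilu -pc_factor_mat_solver_type cusparse
*/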
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (A->boundtocpu && A->bindingpropagates) { ierr = MatBindToCPU(*B,PETSC_TRUE);CHKERRQ(ierr); }
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr);
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr);
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  (*B)->canuseordering = PETSC_TRUE;
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
  MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
  operation. Only the MatMult operation can use different GPU storage formats
  for MPIAIJCUSPARSE matrices.

  Not Collective

  Input Parameters:
+ A      - Matrix of type SEQAIJCUSPARSE
. op     - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
- format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

  Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
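/* Example usage (a sketch; assumes a previously created matrix A of type MATSEQAIJCUSPARSE):

     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,MAT_CUSPARSE_ELL);CHKERRQ(ierr);

   or from the command line: -mat_cusparse_mult_storage_format ell
*/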
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A,PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->use_cpu_solve = use_cpu;
  PetscFunctionReturn(0);
}

/*@
  MatCUSPARSESetUseCPUSolve - Sets whether MatSolve() is done on the CPU

  Input Parameters:
+ A       - Matrix of type SEQAIJCUSPARSE
- use_cpu - set flag for using the built-in CPU MatSolve

  Notes:
  The cuSPARSE LU solver currently computes the factors with the built-in CPU method
  and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
  This method specifies whether the solve is done on the CPU or the GPU (the GPU is the default).

  Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  ierr = PetscTryMethod(A,"MatCUSPARSESetUseCPUSolve_C",(Mat,PetscBool),(A,use_cpu));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_FORM_EXPLICIT_TRANSPOSE:
    /* need to destroy the transpose matrix if present to prevent logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);}
    A->form_explicit_transpose = flg;
    break;
  default:
    ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
    break;
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);

static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ         *b = (Mat_SeqAIJ*)B->data;
  IS                 isrow = b->row,iscol = b->col;
  PetscBool          row_identity,col_identity;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)B->spptr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    if (!cusparsestruct->use_cpu_solve) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    }
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    if (!cusparsestruct->use_cpu_solve) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  if (!cusparsestruct->use_cpu_solve) {
    ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
"MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 354 if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);} 355 ierr = PetscOptionsBool("-mat_cusparse_use_cpu_solve","Use CPU (I)LU solve","MatCUSPARSESetUseCPUSolve",cusparsestruct->use_cpu_solve,&cusparsestruct->use_cpu_solve,&flg);CHKERRQ(ierr); 356 if (flg) {ierr = MatCUSPARSESetUseCPUSolve(A,cusparsestruct->use_cpu_solve);CHKERRQ(ierr);} 357 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 358 ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", 359 "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr); 360 /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */ 361 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 362 if (flg && CUSPARSE_SPMV_CSR_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 363 #else 364 if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 365 #endif 366 ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", 367 "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr); 368 if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly"); 369 370 ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", 371 "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr); 372 if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly"); 373 #endif 374 } 375 ierr = PetscOptionsTail();CHKERRQ(ierr); 376 PetscFunctionReturn(0); 377 } 378 379 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 380 { 381 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 382 PetscErrorCode ierr; 383 384 PetscFunctionBegin; 385 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 386 ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 387 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 388 PetscFunctionReturn(0); 389 } 390 391 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 392 { 393 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 394 PetscErrorCode ierr; 395 396 PetscFunctionBegin; 397 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 398 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 399 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 400 PetscFunctionReturn(0); 401 } 402 403 static PetscErrorCode 
MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 404 { 405 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 406 PetscErrorCode ierr; 407 408 PetscFunctionBegin; 409 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 410 ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 411 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 412 PetscFunctionReturn(0); 413 } 414 415 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 416 { 417 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 418 PetscErrorCode ierr; 419 420 PetscFunctionBegin; 421 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 422 ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 423 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 424 PetscFunctionReturn(0); 425 } 426 427 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 428 { 429 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 430 PetscInt n = A->rmap->n; 431 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 432 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 433 cusparseStatus_t stat; 434 const PetscInt *ai = a->i,*aj = a->j,*vi; 435 const MatScalar *aa = a->a,*v; 436 PetscInt *AiLo, *AjLo; 437 PetscInt i,nz, nzLower, offset, rowOffset; 438 PetscErrorCode ierr; 439 cudaError_t cerr; 440 441 PetscFunctionBegin; 442 if (!n) PetscFunctionReturn(0); 443 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 444 try { 445 /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. 
*/ 446 nzLower=n+ai[n]-ai[1]; 447 if (!loTriFactor) { 448 PetscScalar *AALo; 449 450 cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr); 451 452 /* Allocate Space for the lower triangular matrix */ 453 cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 454 cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr); 455 456 /* Fill the lower triangular matrix */ 457 AiLo[0] = (PetscInt) 0; 458 AiLo[n] = nzLower; 459 AjLo[0] = (PetscInt) 0; 460 AALo[0] = (MatScalar) 1.0; 461 v = aa; 462 vi = aj; 463 offset = 1; 464 rowOffset= 1; 465 for (i=1; i<n; i++) { 466 nz = ai[i+1] - ai[i]; 467 /* additional 1 for the term on the diagonal */ 468 AiLo[i] = rowOffset; 469 rowOffset += nz+1; 470 471 ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr); 472 ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr); 473 474 offset += nz; 475 AjLo[offset] = (PetscInt) i; 476 AALo[offset] = (MatScalar) 1.0; 477 offset += 1; 478 479 v += nz; 480 vi += nz; 481 } 482 483 /* allocate space for the triangular factor information */ 484 ierr = PetscNew(&loTriFactor);CHKERRQ(ierr); 485 loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 486 /* Create the matrix description */ 487 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 488 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 489 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 490 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 491 #else 492 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 493 #endif 494 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat); 495 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 496 497 /* set the operation */ 498 loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 499 500 /* set the matrix */ 501 loTriFactor->csrMat = new CsrMatrix; 502 loTriFactor->csrMat->num_rows = n; 503 loTriFactor->csrMat->num_cols = n; 504 loTriFactor->csrMat->num_entries = nzLower; 505 506 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 507 loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1); 508 509 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 510 loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower); 511 512 loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 513 loTriFactor->csrMat->values->assign(AALo, AALo+nzLower); 514 515 /* Create the solve analysis information */ 516 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 517 stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 518 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 519 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 520 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 521 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 522 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 523 &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 524 cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 525 #endif 526 527 /* perform the solve analysis */ 528 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 529 loTriFactor->csrMat->num_rows, 
loTriFactor->csrMat->num_entries, loTriFactor->descr, 530 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 531 loTriFactor->csrMat->column_indices->data().get(), 532 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 533 loTriFactor->solveInfo, 534 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 535 #else 536 loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 537 #endif 538 cerr = WaitForCUDA();CHKERRCUDA(cerr); 539 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 540 541 /* assign the pointer */ 542 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 543 loTriFactor->AA_h = AALo; 544 cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr); 545 cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr); 546 ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 547 } else { /* update values only */ 548 if (!loTriFactor->AA_h) { 549 cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr); 550 } 551 /* Fill the lower triangular matrix */ 552 loTriFactor->AA_h[0] = 1.0; 553 v = aa; 554 vi = aj; 555 offset = 1; 556 for (i=1; i<n; i++) { 557 nz = ai[i+1] - ai[i]; 558 ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr); 559 offset += nz; 560 loTriFactor->AA_h[offset] = 1.0; 561 offset += 1; 562 v += nz; 563 } 564 loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower); 565 ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 566 } 567 } catch(char *ex) { 568 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 569 } 570 } 571 PetscFunctionReturn(0); 572 } 573 574 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 575 { 576 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 577 PetscInt n = A->rmap->n; 578 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 579 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 580 cusparseStatus_t stat; 581 const PetscInt *aj = a->j,*adiag = a->diag,*vi; 582 const MatScalar *aa = a->a,*v; 583 PetscInt *AiUp, *AjUp; 584 PetscInt i,nz, nzUpper, offset; 585 PetscErrorCode ierr; 586 cudaError_t cerr; 587 588 PetscFunctionBegin; 589 if (!n) PetscFunctionReturn(0); 590 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 591 try { 592 /* next, figure out the number of nonzeros in the upper triangular matrix. 
*/ 593 nzUpper = adiag[0]-adiag[n]; 594 if (!upTriFactor) { 595 PetscScalar *AAUp; 596 597 cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 598 599 /* Allocate Space for the upper triangular matrix */ 600 cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 601 cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 602 603 /* Fill the upper triangular matrix */ 604 AiUp[0]=(PetscInt) 0; 605 AiUp[n]=nzUpper; 606 offset = nzUpper; 607 for (i=n-1; i>=0; i--) { 608 v = aa + adiag[i+1] + 1; 609 vi = aj + adiag[i+1] + 1; 610 611 /* number of elements NOT on the diagonal */ 612 nz = adiag[i] - adiag[i+1]-1; 613 614 /* decrement the offset */ 615 offset -= (nz+1); 616 617 /* first, set the diagonal elements */ 618 AjUp[offset] = (PetscInt) i; 619 AAUp[offset] = (MatScalar)1./v[nz]; 620 AiUp[i] = AiUp[i+1] - (nz+1); 621 622 ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr); 623 ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr); 624 } 625 626 /* allocate space for the triangular factor information */ 627 ierr = PetscNew(&upTriFactor);CHKERRQ(ierr); 628 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 629 630 /* Create the matrix description */ 631 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 632 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 633 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 634 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 635 #else 636 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 637 #endif 638 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 639 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 640 641 /* set the operation */ 642 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 643 644 /* set the matrix */ 645 upTriFactor->csrMat = new CsrMatrix; 646 upTriFactor->csrMat->num_rows = n; 647 upTriFactor->csrMat->num_cols = n; 648 upTriFactor->csrMat->num_entries = nzUpper; 649 650 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 651 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 652 653 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 654 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 655 656 upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 657 upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 658 659 /* Create the solve analysis information */ 660 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 661 stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 662 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 663 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 664 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 665 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 666 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 667 &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 668 cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 669 #endif 670 671 /* perform the solve analysis */ 672 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 673 upTriFactor->csrMat->num_rows, 
upTriFactor->csrMat->num_entries, upTriFactor->descr, 674 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 675 upTriFactor->csrMat->column_indices->data().get(), 676 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 677 upTriFactor->solveInfo, 678 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 679 #else 680 upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 681 #endif 682 cerr = WaitForCUDA();CHKERRCUDA(cerr); 683 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 684 685 /* assign the pointer */ 686 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 687 upTriFactor->AA_h = AAUp; 688 cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 689 cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 690 ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 691 } else { 692 if (!upTriFactor->AA_h) { 693 cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 694 } 695 /* Fill the upper triangular matrix */ 696 offset = nzUpper; 697 for (i=n-1; i>=0; i--) { 698 v = aa + adiag[i+1] + 1; 699 700 /* number of elements NOT on the diagonal */ 701 nz = adiag[i] - adiag[i+1]-1; 702 703 /* decrement the offset */ 704 offset -= (nz+1); 705 706 /* first, set the diagonal elements */ 707 upTriFactor->AA_h[offset] = 1./v[nz]; 708 ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr); 709 } 710 upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper); 711 ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 712 } 713 } catch(char *ex) { 714 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 715 } 716 } 717 PetscFunctionReturn(0); 718 } 719 720 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 721 { 722 PetscErrorCode ierr; 723 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 724 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 725 IS isrow = a->row,iscol = a->icol; 726 PetscBool row_identity,col_identity; 727 PetscInt n = A->rmap->n; 728 729 PetscFunctionBegin; 730 if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 731 ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr); 732 ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr); 733 734 if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 735 cusparseTriFactors->nnz=a->nz; 736 737 A->offloadmask = PETSC_OFFLOAD_BOTH; 738 /* lower triangular indices */ 739 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 740 if (!row_identity && !cusparseTriFactors->rpermIndices) { 741 const PetscInt *r; 742 743 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 744 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 745 cusparseTriFactors->rpermIndices->assign(r, r+n); 746 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 747 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 748 } 749 750 /* upper triangular indices */ 751 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 752 if (!col_identity && !cusparseTriFactors->cpermIndices) { 753 const PetscInt *c; 754 755 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 756 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 757 cusparseTriFactors->cpermIndices->assign(c, c+n); 758 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 759 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 760 } 761 PetscFunctionReturn(0); 762 } 
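/* Build both triangular solve structures for ICC on the GPU from the upper-triangular (SBAIJ-style)
   storage produced by the CPU factorization; the lower solve is performed as a transposed solve on
   the upper-triangular pattern (see the CUSPARSE_OPERATION_TRANSPOSE setting below). */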
763 764 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 765 { 766 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 767 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 768 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 769 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 770 cusparseStatus_t stat; 771 PetscErrorCode ierr; 772 cudaError_t cerr; 773 PetscInt *AiUp, *AjUp; 774 PetscScalar *AAUp; 775 PetscScalar *AALo; 776 PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 777 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 778 const PetscInt *ai = b->i,*aj = b->j,*vj; 779 const MatScalar *aa = b->a,*v; 780 781 PetscFunctionBegin; 782 if (!n) PetscFunctionReturn(0); 783 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 784 try { 785 cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 786 cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 787 if (!upTriFactor && !loTriFactor) { 788 /* Allocate Space for the upper triangular matrix */ 789 cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 790 cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 791 792 /* Fill the upper triangular matrix */ 793 AiUp[0]=(PetscInt) 0; 794 AiUp[n]=nzUpper; 795 offset = 0; 796 for (i=0; i<n; i++) { 797 /* set the pointers */ 798 v = aa + ai[i]; 799 vj = aj + ai[i]; 800 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 801 802 /* first, set the diagonal elements */ 803 AjUp[offset] = (PetscInt) i; 804 AAUp[offset] = (MatScalar)1.0/v[nz]; 805 AiUp[i] = offset; 806 AALo[offset] = (MatScalar)1.0/v[nz]; 807 808 offset+=1; 809 if (nz>0) { 810 ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr); 811 ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 812 for (j=offset; j<offset+nz; j++) { 813 AAUp[j] = -AAUp[j]; 814 AALo[j] = AAUp[j]/v[nz]; 815 } 816 offset+=nz; 817 } 818 } 819 820 /* allocate space for the triangular factor information */ 821 ierr = PetscNew(&upTriFactor);CHKERRQ(ierr); 822 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 823 824 /* Create the matrix description */ 825 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 826 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 827 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 828 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 829 #else 830 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 831 #endif 832 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 833 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 834 835 /* set the matrix */ 836 upTriFactor->csrMat = new CsrMatrix; 837 upTriFactor->csrMat->num_rows = A->rmap->n; 838 upTriFactor->csrMat->num_cols = A->cmap->n; 839 upTriFactor->csrMat->num_entries = a->nz; 840 841 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 842 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 843 844 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 845 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 846 847 
upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 848 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 849 850 /* set the operation */ 851 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 852 853 /* Create the solve analysis information */ 854 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 855 stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 856 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 857 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 858 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 859 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 860 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 861 &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 862 cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 863 #endif 864 865 /* perform the solve analysis */ 866 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 867 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 868 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 869 upTriFactor->csrMat->column_indices->data().get(), 870 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 871 upTriFactor->solveInfo, 872 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 873 #else 874 upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 875 #endif 876 cerr = WaitForCUDA();CHKERRCUDA(cerr); 877 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 878 879 /* assign the pointer */ 880 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 881 882 /* allocate space for the triangular factor information */ 883 ierr = PetscNew(&loTriFactor);CHKERRQ(ierr); 884 loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 885 886 /* Create the matrix description */ 887 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 888 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 889 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 890 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 891 #else 892 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 893 #endif 894 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 895 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 896 897 /* set the operation */ 898 loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 899 900 /* set the matrix */ 901 loTriFactor->csrMat = new CsrMatrix; 902 loTriFactor->csrMat->num_rows = A->rmap->n; 903 loTriFactor->csrMat->num_cols = A->cmap->n; 904 loTriFactor->csrMat->num_entries = a->nz; 905 906 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 907 loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 908 909 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 910 loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 911 912 loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 913 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 914 915 /* Create the solve analysis information */ 916 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 917 stat = 
cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 918 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 919 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 920 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 921 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 922 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 923 &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 924 cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 925 #endif 926 927 /* perform the solve analysis */ 928 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 929 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 930 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 931 loTriFactor->csrMat->column_indices->data().get(), 932 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 933 loTriFactor->solveInfo, 934 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 935 #else 936 loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 937 #endif 938 cerr = WaitForCUDA();CHKERRCUDA(cerr); 939 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 940 941 /* assign the pointer */ 942 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 943 944 ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr); 945 cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 946 cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 947 } else { 948 /* Fill the upper triangular matrix */ 949 offset = 0; 950 for (i=0; i<n; i++) { 951 /* set the pointers */ 952 v = aa + ai[i]; 953 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 954 955 /* first, set the diagonal elements */ 956 AAUp[offset] = 1.0/v[nz]; 957 AALo[offset] = 1.0/v[nz]; 958 959 offset+=1; 960 if (nz>0) { 961 ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 962 for (j=offset; j<offset+nz; j++) { 963 AAUp[j] = -AAUp[j]; 964 AALo[j] = AAUp[j]/v[nz]; 965 } 966 offset+=nz; 967 } 968 } 969 if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 970 if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 971 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 972 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 973 ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 974 } 975 cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr); 976 cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr); 977 } catch(char *ex) { 978 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 979 } 980 } 981 PetscFunctionReturn(0); 982 } 983 984 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 985 { 986 PetscErrorCode ierr; 987 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 988 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 989 IS ip = a->row; 990 PetscBool perm_identity; 991 PetscInt n = A->rmap->n; 992 993 PetscFunctionBegin; 994 if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 995 ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr); 996 if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 997 cusparseTriFactors->nnz=(a->nz-n)*2 + n; 998 999 A->offloadmask = PETSC_OFFLOAD_BOTH; 1000 1001 /* lower triangular indices */ 1002 ierr = 
ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 1003 if (!perm_identity) { 1004 IS iip; 1005 const PetscInt *irip,*rip; 1006 1007 ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr); 1008 ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr); 1009 ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); 1010 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 1011 cusparseTriFactors->rpermIndices->assign(rip, rip+n); 1012 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 1013 cusparseTriFactors->cpermIndices->assign(irip, irip+n); 1014 ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr); 1015 ierr = ISDestroy(&iip);CHKERRQ(ierr); 1016 ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 1017 ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 1018 } 1019 PetscFunctionReturn(0); 1020 } 1021 1022 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 1023 { 1024 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 1025 IS ip = b->row; 1026 PetscBool perm_identity; 1027 PetscErrorCode ierr; 1028 1029 PetscFunctionBegin; 1030 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 1031 ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 1032 B->offloadmask = PETSC_OFFLOAD_CPU; 1033 /* determine which version of MatSolve needs to be used. */ 1034 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 1035 if (perm_identity) { 1036 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 1037 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 1038 B->ops->matsolve = NULL; 1039 B->ops->matsolvetranspose = NULL; 1040 } else { 1041 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 1042 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 1043 B->ops->matsolve = NULL; 1044 B->ops->matsolvetranspose = NULL; 1045 } 1046 1047 /* get the triangular factors */ 1048 ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 1049 PetscFunctionReturn(0); 1050 } 1051 1052 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 1053 { 1054 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1055 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1056 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1057 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; 1058 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; 1059 cusparseStatus_t stat; 1060 cusparseIndexBase_t indexBase; 1061 cusparseMatrixType_t matrixType; 1062 cusparseFillMode_t fillMode; 1063 cusparseDiagType_t diagType; 1064 cudaError_t cerr; 1065 PetscErrorCode ierr; 1066 1067 PetscFunctionBegin; 1068 /* allocate space for the transpose of the lower triangular factor */ 1069 ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr); 1070 loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1071 1072 /* set the matrix descriptors of the lower triangular factor */ 1073 matrixType = cusparseGetMatType(loTriFactor->descr); 1074 indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 1075 fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 
1076 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1077 diagType = cusparseGetMatDiagType(loTriFactor->descr); 1078 1079 /* Create the matrix description */ 1080 stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat); 1081 stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 1082 stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 1083 stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 1084 stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1085 1086 /* set the operation */ 1087 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1088 1089 /* allocate GPU space for the CSC of the lower triangular factor*/ 1090 loTriFactorT->csrMat = new CsrMatrix; 1091 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 1092 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 1093 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 1094 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1); 1095 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 1096 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 1097 1098 /* compute the transpose of the lower triangular factor, i.e. the CSC */ 1099 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1100 stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1101 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1102 loTriFactor->csrMat->values->data().get(), 1103 loTriFactor->csrMat->row_offsets->data().get(), 1104 loTriFactor->csrMat->column_indices->data().get(), 1105 loTriFactorT->csrMat->values->data().get(), 1106 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1107 CUSPARSE_ACTION_NUMERIC,indexBase, 1108 CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1109 cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1110 #endif 1111 1112 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1113 stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1114 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1115 loTriFactor->csrMat->values->data().get(), 1116 loTriFactor->csrMat->row_offsets->data().get(), 1117 loTriFactor->csrMat->column_indices->data().get(), 1118 loTriFactorT->csrMat->values->data().get(), 1119 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1120 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1121 CUSPARSE_ACTION_NUMERIC, indexBase, 1122 CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat); 1123 #else 1124 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1125 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1126 #endif 1127 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1128 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1129 1130 /* Create the solve analysis information */ 1131 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1132 stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1133 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1134 stat = 
cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, 1135 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1136 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1137 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, 1138 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1139 cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1140 #endif 1141 1142 /* perform the solve analysis */ 1143 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 1144 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1145 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1146 loTriFactorT->csrMat->column_indices->data().get(), 1147 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1148 loTriFactorT->solveInfo, 1149 loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1150 #else 1151 loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1152 #endif 1153 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1154 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1155 1156 /* assign the pointer */ 1157 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 1158 1159 /*********************************************/ 1160 /* Now the Transpose of the Upper Tri Factor */ 1161 /*********************************************/ 1162 1163 /* allocate space for the transpose of the upper triangular factor */ 1164 ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr); 1165 upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1166 1167 /* set the matrix descriptors of the upper triangular factor */ 1168 matrixType = cusparseGetMatType(upTriFactor->descr); 1169 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 1170 fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1171 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1172 diagType = cusparseGetMatDiagType(upTriFactor->descr); 1173 1174 /* Create the matrix description */ 1175 stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat); 1176 stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 1177 stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 1178 stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 1179 stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1180 1181 /* set the operation */ 1182 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1183 1184 /* allocate GPU space for the CSC of the upper triangular factor*/ 1185 upTriFactorT->csrMat = new CsrMatrix; 1186 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1187 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1188 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1189 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1); 1190 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1191 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1192 1193 /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 1194 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1195 stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows, 1196 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1197 upTriFactor->csrMat->values->data().get(), 1198 upTriFactor->csrMat->row_offsets->data().get(), 1199 upTriFactor->csrMat->column_indices->data().get(), 1200 upTriFactorT->csrMat->values->data().get(), 1201 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1202 CUSPARSE_ACTION_NUMERIC,indexBase, 1203 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1204 cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1205 #endif 1206 1207 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1208 stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 1209 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1210 upTriFactor->csrMat->values->data().get(), 1211 upTriFactor->csrMat->row_offsets->data().get(), 1212 upTriFactor->csrMat->column_indices->data().get(), 1213 upTriFactorT->csrMat->values->data().get(), 1214 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1215 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1216 CUSPARSE_ACTION_NUMERIC, indexBase, 1217 CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat); 1218 #else 1219 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1220 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1221 #endif 1222 1223 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1224 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1225 1226 /* Create the solve analysis information */ 1227 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1228 stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1229 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1230 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, 1231 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1232 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1233 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, 1234 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1235 cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1236 #endif 1237 1238 /* perform the solve analysis */ 1239 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 1240 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1241 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1242 upTriFactorT->csrMat->column_indices->data().get(), 1243 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1244 upTriFactorT->solveInfo, 1245 upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1246 #else 1247 upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1248 #endif 1249 1250 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1251 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1252 1253 /* assign the pointer */ 1254 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 1255 PetscFunctionReturn(0); 1256 } 1257 
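/* Unary functor mapping a PetscScalar to a PetscInt by taking its real part; intended for Thrust
   transforms over scalar arrays that temporarily hold integer indices. */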
1258 struct PetscScalarToPetscInt 1259 { 1260 __host__ __device__ 1261 PetscInt operator()(PetscScalar s) 1262 { 1263 return (PetscInt)PetscRealPart(s); 1264 } 1265 }; 1266 1267 static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A) 1268 { 1269 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1270 Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT; 1271 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1272 cusparseStatus_t stat; 1273 cusparseIndexBase_t indexBase; 1274 cudaError_t err; 1275 PetscErrorCode ierr; 1276 1277 PetscFunctionBegin; 1278 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1279 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1280 if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct"); 1281 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1282 if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct"); 1283 if (A->transupdated) PetscFunctionReturn(0); 1284 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1285 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1286 if (cusparsestruct->format != MAT_CUSPARSE_CSR) { 1287 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 1288 } 1289 if (!cusparsestruct->matTranspose) { /* create cusparse matrix */ 1290 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 1291 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat); 1292 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1293 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat); 1294 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1295 1296 /* set alpha and beta */ 1297 err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 1298 err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1299 err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1300 err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1301 err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1302 err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1303 1304 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1305 CsrMatrix *matrixT = new CsrMatrix; 1306 matstructT->mat = matrixT; 1307 matrixT->num_rows = A->cmap->n; 1308 matrixT->num_cols = A->rmap->n; 1309 matrixT->num_entries = a->nz; 1310 matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 1311 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1312 matrixT->values = new THRUSTARRAY(a->nz); 1313 1314 if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); } 1315 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 1316 1317 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1318 #if PETSC_PKG_CUDA_VERSION_GE(11,2,1) 1319 stat = cusparseCreateCsr(&matstructT->matDescr, 1320 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1321 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1322 matrixT->values->data().get(), 1323 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1324 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 1325 #else 1326 /* 
cusparse-11.x returns errors with zero-sized matrices until 11.2.1, 1327 see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1 1328 1329 I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set 1330 it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, 1331 when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly. 1332 */ 1333 if (matrixT->num_entries) { 1334 stat = cusparseCreateCsr(&matstructT->matDescr, 1335 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1336 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1337 matrixT->values->data().get(), 1338 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, 1339 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 1340 1341 } else { 1342 matstructT->matDescr = NULL; 1343 matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 1344 } 1345 #endif 1346 #endif 1347 } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 1348 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1349 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1350 #else 1351 CsrMatrix *temp = new CsrMatrix; 1352 CsrMatrix *tempT = new CsrMatrix; 1353 /* First convert HYB to CSR */ 1354 temp->num_rows = A->rmap->n; 1355 temp->num_cols = A->cmap->n; 1356 temp->num_entries = a->nz; 1357 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1358 temp->column_indices = new THRUSTINTARRAY32(a->nz); 1359 temp->values = new THRUSTARRAY(a->nz); 1360 1361 stat = cusparse_hyb2csr(cusparsestruct->handle, 1362 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 1363 temp->values->data().get(), 1364 temp->row_offsets->data().get(), 1365 temp->column_indices->data().get());CHKERRCUSPARSE(stat); 1366 1367 /* Next, convert CSR to CSC (i.e. the matrix transpose) */ 1368 tempT->num_rows = A->rmap->n; 1369 tempT->num_cols = A->cmap->n; 1370 tempT->num_entries = a->nz; 1371 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1372 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1373 tempT->values = new THRUSTARRAY(a->nz); 1374 1375 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1376 temp->num_cols, temp->num_entries, 1377 temp->values->data().get(), 1378 temp->row_offsets->data().get(), 1379 temp->column_indices->data().get(), 1380 tempT->values->data().get(), 1381 tempT->column_indices->data().get(), 1382 tempT->row_offsets->data().get(), 1383 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1384 1385 /* Last, convert CSC to HYB */ 1386 cusparseHybMat_t hybMat; 1387 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1388 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 
1389 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1390 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1391 matstructT->descr, tempT->values->data().get(), 1392 tempT->row_offsets->data().get(), 1393 tempT->column_indices->data().get(), 1394 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1395 1396 /* assign the pointer */ 1397 matstructT->mat = hybMat; 1398 A->transupdated = PETSC_TRUE; 1399 /* delete temporaries */ 1400 if (tempT) { 1401 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1402 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1403 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1404 delete (CsrMatrix*) tempT; 1405 } 1406 if (temp) { 1407 if (temp->values) delete (THRUSTARRAY*) temp->values; 1408 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1409 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1410 delete (CsrMatrix*) temp; 1411 } 1412 #endif 1413 } 1414 } 1415 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */ 1416 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1417 CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat; 1418 if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix"); 1419 if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows"); 1420 if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols"); 1421 if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values"); 1422 if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT"); 1423 if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows"); 1424 if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols"); 1425 if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values"); 1426 if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */ 1427 cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1428 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 1429 ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 1430 } 1431 if (!cusparsestruct->csr2csc_i) { 1432 THRUSTARRAY csr2csc_a(matrix->num_entries); 1433 PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0)); 1434 1435 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1436 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1437 void *csr2cscBuffer; 1438 size_t csr2cscBufferSize; 1439 stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, 1440 A->cmap->n, matrix->num_entries, 1441 matrix->values->data().get(), 1442 cusparsestruct->rowoffsets_gpu->data().get(), 1443 matrix->column_indices->data().get(), 1444 matrixT->values->data().get(), 1445 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1446 CUSPARSE_ACTION_NUMERIC,indexBase, 1447 cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat); 1448 err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err); 1449 #endif 1450 1451 if (matrix->num_entries) { 1452 /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in 1453 mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK. 
        I checked every parameter and they were just fine. I have no clue why cusparse complains.

        Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
        should be filled with indexBase. So I just take a shortcut here.
      */
      stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                              A->cmap->n,matrix->num_entries,
                              csr2csc_a.data().get(),
                              cusparsestruct->rowoffsets_gpu->data().get(),
                              matrix->column_indices->data().get(),
                              matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                              matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                              CUSPARSE_ACTION_NUMERIC,indexBase,
                              cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat);
#else
                              matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
#endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
      }

      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      err = cudaFree(csr2cscBuffer);CHKERRCUDA(err);
#endif
    }
    PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
                                      thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
                                      matrixT->values->begin()));
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices are not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

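/*
   Usage sketch (the names below are standard PETSc options-database/API names, not specific to this file):
   the factored-solve kernels that follow are normally reached by selecting the cusparse solver for a
   factorization preconditioner, e.g.

     ./app -mat_type aijcusparse -vec_type cuda -pc_type ilu -pc_factor_mat_solver_type cusparse

   or, in code, PCFactorSetMatSolverType(pc,MATSOLVERCUSPARSE); the preconditioner application then invokes
   MatSolve()/MatSolveTranspose() on the factor produced by MatLUFactorNumeric_SeqAIJCUSPARSE().
*/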
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
#endif

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        xarray);CHKERRCUSPARSE(stat);
#endif

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution.
*/ 1576 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU); 1577 1578 /* restore */ 1579 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1580 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1581 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1582 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1583 PetscFunctionReturn(0); 1584 } 1585 1586 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1587 { 1588 const PetscScalar *barray; 1589 PetscScalar *xarray; 1590 cusparseStatus_t stat; 1591 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1592 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1593 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1594 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1595 PetscErrorCode ierr; 1596 1597 PetscFunctionBegin; 1598 /* Analyze the matrix and create the transpose ... on the fly */ 1599 if (!loTriFactorT && !upTriFactorT) { 1600 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1601 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1602 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1603 } 1604 1605 /* Get the GPU pointers */ 1606 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1607 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1608 1609 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1610 /* First, solve U */ 1611 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1612 upTriFactorT->csrMat->num_rows, 1613 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1614 upTriFactorT->csrMat->num_entries, 1615 #endif 1616 &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1617 upTriFactorT->csrMat->values->data().get(), 1618 upTriFactorT->csrMat->row_offsets->data().get(), 1619 upTriFactorT->csrMat->column_indices->data().get(), 1620 upTriFactorT->solveInfo, 1621 barray, 1622 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1623 tempGPU->data().get(), 1624 upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1625 #else 1626 tempGPU->data().get());CHKERRCUSPARSE(stat); 1627 #endif 1628 1629 /* Then, solve L */ 1630 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1631 loTriFactorT->csrMat->num_rows, 1632 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1633 loTriFactorT->csrMat->num_entries, 1634 #endif 1635 &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1636 loTriFactorT->csrMat->values->data().get(), 1637 loTriFactorT->csrMat->row_offsets->data().get(), 1638 loTriFactorT->csrMat->column_indices->data().get(), 1639 loTriFactorT->solveInfo, 1640 tempGPU->data().get(), 1641 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1642 xarray, 1643 loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1644 #else 1645 xarray);CHKERRCUSPARSE(stat); 1646 #endif 1647 1648 /* restore */ 1649 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1650 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1651 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1652 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1653 PetscFunctionReturn(0); 1654 } 1655 1656 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1657 { 1658 const 
PetscScalar *barray; 1659 PetscScalar *xarray; 1660 thrust::device_ptr<const PetscScalar> bGPU; 1661 thrust::device_ptr<PetscScalar> xGPU; 1662 cusparseStatus_t stat; 1663 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1664 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1665 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1666 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1667 PetscErrorCode ierr; 1668 1669 PetscFunctionBegin; 1670 1671 /* Get the GPU pointers */ 1672 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1673 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1674 xGPU = thrust::device_pointer_cast(xarray); 1675 bGPU = thrust::device_pointer_cast(barray); 1676 1677 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1678 /* First, reorder with the row permutation */ 1679 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1680 thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 1681 tempGPU->begin()); 1682 1683 /* Next, solve L */ 1684 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1685 loTriFactor->csrMat->num_rows, 1686 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1687 loTriFactor->csrMat->num_entries, 1688 #endif 1689 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1690 loTriFactor->csrMat->values->data().get(), 1691 loTriFactor->csrMat->row_offsets->data().get(), 1692 loTriFactor->csrMat->column_indices->data().get(), 1693 loTriFactor->solveInfo, 1694 tempGPU->data().get(), 1695 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1696 xarray, 1697 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1698 #else 1699 xarray);CHKERRCUSPARSE(stat); 1700 #endif 1701 1702 /* Then, solve U */ 1703 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1704 upTriFactor->csrMat->num_rows, 1705 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1706 upTriFactor->csrMat->num_entries, 1707 #endif 1708 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1709 upTriFactor->csrMat->values->data().get(), 1710 upTriFactor->csrMat->row_offsets->data().get(), 1711 upTriFactor->csrMat->column_indices->data().get(), 1712 upTriFactor->solveInfo,xarray, 1713 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1714 tempGPU->data().get(), 1715 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1716 #else 1717 tempGPU->data().get());CHKERRCUSPARSE(stat); 1718 #endif 1719 1720 /* Last, reorder with the column permutation */ 1721 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 1722 thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), 1723 xGPU); 1724 1725 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1726 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1727 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1728 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1729 PetscFunctionReturn(0); 1730 } 1731 1732 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1733 { 1734 const PetscScalar *barray; 1735 PetscScalar *xarray; 1736 cusparseStatus_t stat; 1737 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = 
(Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1738 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1739 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1740 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1741 PetscErrorCode ierr; 1742 1743 PetscFunctionBegin; 1744 /* Get the GPU pointers */ 1745 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1746 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1747 1748 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1749 /* First, solve L */ 1750 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1751 loTriFactor->csrMat->num_rows, 1752 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1753 loTriFactor->csrMat->num_entries, 1754 #endif 1755 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1756 loTriFactor->csrMat->values->data().get(), 1757 loTriFactor->csrMat->row_offsets->data().get(), 1758 loTriFactor->csrMat->column_indices->data().get(), 1759 loTriFactor->solveInfo, 1760 barray, 1761 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1762 tempGPU->data().get(), 1763 loTriFactor->solvePolicy,loTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1764 #else 1765 tempGPU->data().get());CHKERRCUSPARSE(stat); 1766 #endif 1767 1768 /* Next, solve U */ 1769 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1770 upTriFactor->csrMat->num_rows, 1771 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1772 upTriFactor->csrMat->num_entries, 1773 #endif 1774 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1775 upTriFactor->csrMat->values->data().get(), 1776 upTriFactor->csrMat->row_offsets->data().get(), 1777 upTriFactor->csrMat->column_indices->data().get(), 1778 upTriFactor->solveInfo, 1779 tempGPU->data().get(), 1780 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1781 xarray, 1782 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1783 #else 1784 xarray);CHKERRCUSPARSE(stat); 1785 #endif 1786 1787 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1788 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1789 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1790 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1791 PetscFunctionReturn(0); 1792 } 1793 1794 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) 1795 { 1796 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1797 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1798 cudaError_t cerr; 1799 PetscErrorCode ierr; 1800 1801 PetscFunctionBegin; 1802 if (A->offloadmask == PETSC_OFFLOAD_GPU) { 1803 CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat; 1804 1805 ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 1806 cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 1807 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1808 ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr); 1809 ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 1810 A->offloadmask = PETSC_OFFLOAD_BOTH; 1811 } 1812 PetscFunctionReturn(0); 1813 } 1814 1815 static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 1816 { 1817 PetscErrorCode ierr; 1818 1819 PetscFunctionBegin; 1820 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 1821 *array = ((Mat_SeqAIJ*)A->data)->a; 1822 PetscFunctionReturn(0); 1823 } 1824 1825 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar 
*array[]) 1826 { 1827 PetscFunctionBegin; 1828 A->offloadmask = PETSC_OFFLOAD_CPU; 1829 *array = NULL; 1830 PetscFunctionReturn(0); 1831 } 1832 1833 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[]) 1834 { 1835 PetscErrorCode ierr; 1836 1837 PetscFunctionBegin; 1838 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 1839 *array = ((Mat_SeqAIJ*)A->data)->a; 1840 PetscFunctionReturn(0); 1841 } 1842 1843 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[]) 1844 { 1845 PetscFunctionBegin; 1846 *array = NULL; 1847 PetscFunctionReturn(0); 1848 } 1849 1850 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 1851 { 1852 PetscFunctionBegin; 1853 *array = ((Mat_SeqAIJ*)A->data)->a; 1854 PetscFunctionReturn(0); 1855 } 1856 1857 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 1858 { 1859 PetscFunctionBegin; 1860 A->offloadmask = PETSC_OFFLOAD_CPU; 1861 *array = NULL; 1862 PetscFunctionReturn(0); 1863 } 1864 1865 PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 1866 { 1867 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1868 Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 1869 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1870 PetscInt m = A->rmap->n,*ii,*ridx,tmp; 1871 PetscErrorCode ierr; 1872 cusparseStatus_t stat; 1873 PetscBool both = PETSC_TRUE; 1874 cudaError_t err; 1875 1876 PetscFunctionBegin; 1877 if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU"); 1878 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 1879 if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */ 1880 CsrMatrix *matrix; 1881 matrix = (CsrMatrix*)cusparsestruct->mat->mat; 1882 1883 if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values"); 1884 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1885 matrix->values->assign(a->a, a->a+a->nz); 1886 err = WaitForCUDA();CHKERRCUDA(err); 1887 ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 1888 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1889 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); 1890 } else { 1891 PetscInt nnz; 1892 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1893 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr); 1894 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 1895 delete cusparsestruct->workVector; 1896 delete cusparsestruct->rowoffsets_gpu; 1897 cusparsestruct->workVector = NULL; 1898 cusparsestruct->rowoffsets_gpu = NULL; 1899 try { 1900 if (a->compressedrow.use) { 1901 m = a->compressedrow.nrows; 1902 ii = a->compressedrow.i; 1903 ridx = a->compressedrow.rindex; 1904 } else { 1905 m = A->rmap->n; 1906 ii = a->i; 1907 ridx = NULL; 1908 } 1909 if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data"); 1910 if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data"); 1911 if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; } 1912 else nnz = a->nz; 1913 1914 /* create cusparse matrix */ 1915 cusparsestruct->nrows = m; 1916 matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 1917 stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat); 1918 stat = 
cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 1919 stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1920 1921 err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 1922 err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1923 err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1924 err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1925 err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1926 err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1927 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 1928 1929 /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1930 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1931 /* set the matrix */ 1932 CsrMatrix *mat= new CsrMatrix; 1933 mat->num_rows = m; 1934 mat->num_cols = A->cmap->n; 1935 mat->num_entries = nnz; 1936 mat->row_offsets = new THRUSTINTARRAY32(m+1); 1937 mat->row_offsets->assign(ii, ii + m+1); 1938 1939 mat->column_indices = new THRUSTINTARRAY32(nnz); 1940 mat->column_indices->assign(a->j, a->j+nnz); 1941 1942 mat->values = new THRUSTARRAY(nnz); 1943 if (a->a) mat->values->assign(a->a, a->a+nnz); 1944 1945 /* assign the pointer */ 1946 matstruct->mat = mat; 1947 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1948 if (mat->num_rows) { /* cusparse errors on empty matrices! */ 1949 stat = cusparseCreateCsr(&matstruct->matDescr, 1950 mat->num_rows, mat->num_cols, mat->num_entries, 1951 mat->row_offsets->data().get(), mat->column_indices->data().get(), 1952 mat->values->data().get(), 1953 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 1954 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 1955 } 1956 #endif 1957 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1958 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1959 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1960 #else 1961 CsrMatrix *mat= new CsrMatrix; 1962 mat->num_rows = m; 1963 mat->num_cols = A->cmap->n; 1964 mat->num_entries = nnz; 1965 mat->row_offsets = new THRUSTINTARRAY32(m+1); 1966 mat->row_offsets->assign(ii, ii + m+1); 1967 1968 mat->column_indices = new THRUSTINTARRAY32(nnz); 1969 mat->column_indices->assign(a->j, a->j+nnz); 1970 1971 mat->values = new THRUSTARRAY(nnz); 1972 if (a->a) mat->values->assign(a->a, a->a+nnz); 1973 1974 cusparseHybMat_t hybMat; 1975 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1976 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 
1977 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1978 stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, 1979 matstruct->descr, mat->values->data().get(), 1980 mat->row_offsets->data().get(), 1981 mat->column_indices->data().get(), 1982 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1983 /* assign the pointer */ 1984 matstruct->mat = hybMat; 1985 1986 if (mat) { 1987 if (mat->values) delete (THRUSTARRAY*)mat->values; 1988 if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices; 1989 if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets; 1990 delete (CsrMatrix*)mat; 1991 } 1992 #endif 1993 } 1994 1995 /* assign the compressed row indices */ 1996 if (a->compressedrow.use) { 1997 cusparsestruct->workVector = new THRUSTARRAY(m); 1998 matstruct->cprowIndices = new THRUSTINTARRAY(m); 1999 matstruct->cprowIndices->assign(ridx,ridx+m); 2000 tmp = m; 2001 } else { 2002 cusparsestruct->workVector = NULL; 2003 matstruct->cprowIndices = NULL; 2004 tmp = 0; 2005 } 2006 ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr); 2007 2008 /* assign the pointer */ 2009 cusparsestruct->mat = matstruct; 2010 } catch(char *ex) { 2011 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 2012 } 2013 err = WaitForCUDA();CHKERRCUDA(err); 2014 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 2015 cusparsestruct->nonzerostate = A->nonzerostate; 2016 } 2017 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 2018 } 2019 PetscFunctionReturn(0); 2020 } 2021 2022 struct VecCUDAPlusEquals 2023 { 2024 template <typename Tuple> 2025 __host__ __device__ 2026 void operator()(Tuple t) 2027 { 2028 thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 2029 } 2030 }; 2031 2032 struct VecCUDAEquals 2033 { 2034 template <typename Tuple> 2035 __host__ __device__ 2036 void operator()(Tuple t) 2037 { 2038 thrust::get<1>(t) = thrust::get<0>(t); 2039 } 2040 }; 2041 2042 struct VecCUDAEqualsReverse 2043 { 2044 template <typename Tuple> 2045 __host__ __device__ 2046 void operator()(Tuple t) 2047 { 2048 thrust::get<0>(t) = thrust::get<1>(t); 2049 } 2050 }; 2051 2052 struct MatMatCusparse { 2053 PetscBool cisdense; 2054 PetscScalar *Bt; 2055 Mat X; 2056 PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */ 2057 PetscLogDouble flops; 2058 CsrMatrix *Bcsr; 2059 2060 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2061 cusparseSpMatDescr_t matSpBDescr; 2062 PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 2063 cusparseDnMatDescr_t matBDescr; 2064 cusparseDnMatDescr_t matCDescr; 2065 PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/ 2066 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2067 void *dBuffer4; 2068 void *dBuffer5; 2069 #endif 2070 size_t mmBufferSize; 2071 void *mmBuffer; 2072 void *mmBuffer2; /* SpGEMM WorkEstimation buffer */ 2073 cusparseSpGEMMDescr_t spgemmDesc; 2074 #endif 2075 }; 2076 2077 static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 2078 { 2079 PetscErrorCode ierr; 2080 MatMatCusparse *mmdata = (MatMatCusparse *)data; 2081 cudaError_t cerr; 2082 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2083 cusparseStatus_t stat; 2084 #endif 2085 2086 PetscFunctionBegin; 2087 cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr); 2088 delete mmdata->Bcsr; 2089 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2090 if (mmdata->matSpBDescr) { stat = 
cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); } 2091 if (mmdata->matBDescr) { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); } 2092 if (mmdata->matCDescr) { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); } 2093 if (mmdata->spgemmDesc) { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); } 2094 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2095 if (mmdata->dBuffer4) { cerr = cudaFree(mmdata->dBuffer4);CHKERRCUDA(cerr); } 2096 if (mmdata->dBuffer5) { cerr = cudaFree(mmdata->dBuffer5);CHKERRCUDA(cerr); } 2097 #endif 2098 if (mmdata->mmBuffer) { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); } 2099 if (mmdata->mmBuffer2) { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); } 2100 #endif 2101 ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr); 2102 ierr = PetscFree(data);CHKERRQ(ierr); 2103 PetscFunctionReturn(0); 2104 } 2105 2106 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool); 2107 2108 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2109 { 2110 Mat_Product *product = C->product; 2111 Mat A,B; 2112 PetscInt m,n,blda,clda; 2113 PetscBool flg,biscuda; 2114 Mat_SeqAIJCUSPARSE *cusp; 2115 cusparseStatus_t stat; 2116 cusparseOperation_t opA; 2117 const PetscScalar *barray; 2118 PetscScalar *carray; 2119 PetscErrorCode ierr; 2120 MatMatCusparse *mmdata; 2121 Mat_SeqAIJCUSPARSEMultStruct *mat; 2122 CsrMatrix *csrmat; 2123 2124 PetscFunctionBegin; 2125 MatCheckProduct(C,1); 2126 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty"); 2127 mmdata = (MatMatCusparse*)product->data; 2128 A = product->A; 2129 B = product->B; 2130 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2131 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2132 /* currently CopyToGpu does not copy if the matrix is bound to CPU 2133 Instead of silently accepting the wrong answer, I prefer to raise the error */ 2134 if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2135 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2136 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2137 switch (product->type) { 2138 case MATPRODUCT_AB: 2139 case MATPRODUCT_PtAP: 2140 mat = cusp->mat; 2141 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2142 m = A->rmap->n; 2143 n = B->cmap->n; 2144 break; 2145 case MATPRODUCT_AtB: 2146 if (!A->form_explicit_transpose) { 2147 mat = cusp->mat; 2148 opA = CUSPARSE_OPERATION_TRANSPOSE; 2149 } else { 2150 ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr); 2151 mat = cusp->matTranspose; 2152 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2153 } 2154 m = A->cmap->n; 2155 n = B->cmap->n; 2156 break; 2157 case MATPRODUCT_ABt: 2158 case MATPRODUCT_RARt: 2159 mat = cusp->mat; 2160 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2161 m = A->rmap->n; 2162 n = B->rmap->n; 2163 break; 2164 default: 2165 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2166 } 2167 if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 2168 csrmat = (CsrMatrix*)mat->mat; 2169 /* if the user passed a CPU matrix, copy the data to the GPU */ 2170 ierr = 
PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr); 2171 if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);} 2172 ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr); 2173 2174 ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr); 2175 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2176 ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2177 ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr); 2178 } else { 2179 ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr); 2180 ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr); 2181 } 2182 2183 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2184 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2185 cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 2186 /* (re)allocate mmBuffer if not initialized or LDAs are different */ 2187 if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 2188 size_t mmBufferSize; 2189 if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;} 2190 if (!mmdata->matBDescr) { 2191 stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2192 mmdata->Blda = blda; 2193 } 2194 2195 if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;} 2196 if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 2197 stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2198 mmdata->Clda = clda; 2199 } 2200 2201 if (!mat->matDescr) { 2202 stat = cusparseCreateCsr(&mat->matDescr, 2203 csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, 2204 csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), 2205 csrmat->values->data().get(), 2206 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2207 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 2208 } 2209 stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one, 2210 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2211 mmdata->matCDescr,cusparse_scalartype, 2212 cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat); 2213 if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) { 2214 cudaError_t cerr; 2215 cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); 2216 cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr); 2217 mmdata->mmBufferSize = mmBufferSize; 2218 } 2219 mmdata->initialized = PETSC_TRUE; 2220 } else { 2221 /* to be safe, always update pointers of the mats */ 2222 stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat); 2223 stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat); 2224 stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat); 2225 } 2226 2227 /* do cusparseSpMM, which supports transpose on B */ 2228 stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one, 2229 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2230 mmdata->matCDescr,cusparse_scalartype, 2231 cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2232 #else 2233 PetscInt k; 2234 /* cusparseXcsrmm does not support 
transpose on B */ 2235 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2236 cublasHandle_t cublasv2handle; 2237 cublasStatus_t cerr; 2238 2239 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 2240 cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T, 2241 B->cmap->n,B->rmap->n, 2242 &PETSC_CUSPARSE_ONE ,barray,blda, 2243 &PETSC_CUSPARSE_ZERO,barray,blda, 2244 mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr); 2245 blda = B->cmap->n; 2246 k = B->cmap->n; 2247 } else { 2248 k = B->rmap->n; 2249 } 2250 2251 /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 2252 stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k, 2253 csrmat->num_entries,mat->alpha_one,mat->descr, 2254 csrmat->values->data().get(), 2255 csrmat->row_offsets->data().get(), 2256 csrmat->column_indices->data().get(), 2257 mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero, 2258 carray,clda);CHKERRCUSPARSE(stat); 2259 #endif 2260 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2261 ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr); 2262 ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr); 2263 if (product->type == MATPRODUCT_RARt) { 2264 ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2265 ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2266 } else if (product->type == MATPRODUCT_PtAP) { 2267 ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2268 ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2269 } else { 2270 ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr); 2271 } 2272 if (mmdata->cisdense) { 2273 ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr); 2274 } 2275 if (!biscuda) { 2276 ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 2277 } 2278 PetscFunctionReturn(0); 2279 } 2280 2281 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2282 { 2283 Mat_Product *product = C->product; 2284 Mat A,B; 2285 PetscInt m,n; 2286 PetscBool cisdense,flg; 2287 PetscErrorCode ierr; 2288 MatMatCusparse *mmdata; 2289 Mat_SeqAIJCUSPARSE *cusp; 2290 2291 PetscFunctionBegin; 2292 MatCheckProduct(C,1); 2293 if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty"); 2294 A = product->A; 2295 B = product->B; 2296 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2297 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2298 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2299 if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2300 switch (product->type) { 2301 case MATPRODUCT_AB: 2302 m = A->rmap->n; 2303 n = B->cmap->n; 2304 break; 2305 case MATPRODUCT_AtB: 2306 m = A->cmap->n; 2307 n = B->cmap->n; 2308 break; 2309 case MATPRODUCT_ABt: 2310 m = A->rmap->n; 2311 n = B->rmap->n; 2312 break; 2313 case MATPRODUCT_PtAP: 2314 m = B->cmap->n; 2315 n = B->cmap->n; 2316 break; 2317 case MATPRODUCT_RARt: 2318 m = B->rmap->n; 2319 n = B->rmap->n; 2320 break; 2321 default: 2322 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2323 } 2324 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2325 /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU 
*/ 2326 ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr); 2327 ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr); 2328 2329 /* product data */ 2330 ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2331 mmdata->cisdense = cisdense; 2332 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0) 2333 /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */ 2334 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2335 cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr); 2336 } 2337 #endif 2338 /* for these products we need intermediate storage */ 2339 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2340 ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr); 2341 ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr); 2342 if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */ 2343 ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr); 2344 } else { 2345 ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr); 2346 } 2347 } 2348 C->product->data = mmdata; 2349 C->product->destroy = MatDestroy_MatMatCusparse; 2350 2351 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 2352 PetscFunctionReturn(0); 2353 } 2354 2355 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2356 { 2357 Mat_Product *product = C->product; 2358 Mat A,B; 2359 Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2360 Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 2361 Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2362 CsrMatrix *Acsr,*Bcsr,*Ccsr; 2363 PetscBool flg; 2364 PetscErrorCode ierr; 2365 cusparseStatus_t stat; 2366 cudaError_t cerr; 2367 MatProductType ptype; 2368 MatMatCusparse *mmdata; 2369 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2370 cusparseSpMatDescr_t BmatSpDescr; 2371 #endif 2372 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 2373 2374 PetscFunctionBegin; 2375 MatCheckProduct(C,1); 2376 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty"); 2377 ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2378 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name); 2379 mmdata = (MatMatCusparse*)C->product->data; 2380 A = product->A; 2381 B = product->B; 2382 if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */ 2383 mmdata->reusesym = PETSC_FALSE; 2384 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2385 if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2386 Cmat = Ccusp->mat; 2387 if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]); 2388 Ccsr = (CsrMatrix*)Cmat->mat; 2389 if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct"); 2390 goto finalize; 2391 } 2392 if (!c->nz) goto finalize; 2393 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2394 if (!flg) 
SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2395 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2396 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name); 2397 if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2398 if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2399 Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2400 Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2401 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2402 if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2403 if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2404 if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2405 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2406 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 2407 2408 ptype = product->type; 2409 if (A->symmetric && ptype == MATPRODUCT_AtB) { 2410 ptype = MATPRODUCT_AB; 2411 if (!product->symbolic_used_the_fact_A_is_symmetric) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric"); 2412 } 2413 if (B->symmetric && ptype == MATPRODUCT_ABt) { 2414 ptype = MATPRODUCT_AB; 2415 if (!product->symbolic_used_the_fact_B_is_symmetric) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric"); 2416 } 2417 switch (ptype) { 2418 case MATPRODUCT_AB: 2419 Amat = Acusp->mat; 2420 Bmat = Bcusp->mat; 2421 break; 2422 case MATPRODUCT_AtB: 2423 Amat = Acusp->matTranspose; 2424 Bmat = Bcusp->mat; 2425 break; 2426 case MATPRODUCT_ABt: 2427 Amat = Acusp->mat; 2428 Bmat = Bcusp->matTranspose; 2429 break; 2430 default: 2431 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2432 } 2433 Cmat = Ccusp->mat; 2434 if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2435 if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2436 if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]); 2437 Acsr = (CsrMatrix*)Amat->mat; 2438 Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */ 2439 Ccsr = (CsrMatrix*)Cmat->mat; 2440 if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct"); 2441 if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct"); 2442 if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct"); 2443 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2444 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2445 BmatSpDescr = mmdata->Bcsr ? 
mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */ 2446 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2447 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2448 stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, 2449 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2450 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2451 mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2452 #else 2453 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2454 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2455 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2456 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2457 stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, 2458 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2459 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2460 #endif 2461 #else 2462 stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, 2463 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2464 Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2465 Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2466 Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2467 #endif 2468 ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2469 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2470 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2471 C->offloadmask = PETSC_OFFLOAD_GPU; 2472 finalize: 2473 /* shorter version of MatAssemblyEnd_SeqAIJ */ 2474 ierr = PetscInfo3(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 2475 ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 2476 ierr = PetscInfo1(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax);CHKERRQ(ierr); 2477 c->reallocs = 0; 2478 C->info.mallocs += 0; 2479 C->info.nz_unneeded = 0; 2480 C->assembled = C->was_assembled = PETSC_TRUE; 2481 C->num_ass++; 2482 PetscFunctionReturn(0); 2483 } 2484 2485 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2486 { 2487 Mat_Product *product = C->product; 2488 Mat A,B; 2489 Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2490 Mat_SeqAIJ *a,*b,*c; 2491 Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2492 CsrMatrix *Acsr,*Bcsr,*Ccsr; 2493 PetscInt i,j,m,n,k; 2494 PetscBool flg; 2495 PetscErrorCode ierr; 2496 cusparseStatus_t stat; 2497 cudaError_t cerr; 2498 MatProductType ptype; 2499 MatMatCusparse *mmdata; 2500 PetscLogDouble flops; 2501 PetscBool biscompressed,ciscompressed; 2502 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2503 int64_t C_num_rows1, C_num_cols1, C_nnz1; 2504 cusparseSpMatDescr_t BmatSpDescr; 2505 #else 2506 int cnz; 2507 #endif 2508 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 2509 2510 PetscFunctionBegin; 2511 MatCheckProduct(C,1); 2512 if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty"); 2513 A = product->A; 2514 B = product->B; 2515 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2516 if (!flg) 
SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2517 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2518 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name); 2519 a = (Mat_SeqAIJ*)A->data; 2520 b = (Mat_SeqAIJ*)B->data; 2521 /* product data */ 2522 ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2523 C->product->data = mmdata; 2524 C->product->destroy = MatDestroy_MatMatCusparse; 2525 2526 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2527 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 2528 Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */ 2529 Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2530 if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2531 if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2532 2533 ptype = product->type; 2534 if (A->symmetric && ptype == MATPRODUCT_AtB) { 2535 ptype = MATPRODUCT_AB; 2536 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 2537 } 2538 if (B->symmetric && ptype == MATPRODUCT_ABt) { 2539 ptype = MATPRODUCT_AB; 2540 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 2541 } 2542 biscompressed = PETSC_FALSE; 2543 ciscompressed = PETSC_FALSE; 2544 switch (ptype) { 2545 case MATPRODUCT_AB: 2546 m = A->rmap->n; 2547 n = B->cmap->n; 2548 k = A->cmap->n; 2549 Amat = Acusp->mat; 2550 Bmat = Bcusp->mat; 2551 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2552 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2553 break; 2554 case MATPRODUCT_AtB: 2555 m = A->cmap->n; 2556 n = B->cmap->n; 2557 k = A->rmap->n; 2558 ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr); 2559 Amat = Acusp->matTranspose; 2560 Bmat = Bcusp->mat; 2561 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2562 break; 2563 case MATPRODUCT_ABt: 2564 m = A->rmap->n; 2565 n = B->rmap->n; 2566 k = A->cmap->n; 2567 ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr); 2568 Amat = Acusp->mat; 2569 Bmat = Bcusp->matTranspose; 2570 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2571 break; 2572 default: 2573 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2574 } 2575 2576 /* create cusparse matrix */ 2577 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2578 ierr = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 2579 c = (Mat_SeqAIJ*)C->data; 2580 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2581 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 2582 Ccsr = new CsrMatrix; 2583 2584 c->compressedrow.use = ciscompressed; 2585 if (c->compressedrow.use) { /* if a is in compressed row, then c will be in compressed row format */ 2586 c->compressedrow.nrows = a->compressedrow.nrows; 2587 ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr); 2588 ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr); 2589 Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 2590 Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 2591 Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows); 2592 } else { 2593 c->compressedrow.nrows = 0; 2594 c->compressedrow.i =
NULL; 2595 c->compressedrow.rindex = NULL; 2596 Ccusp->workVector = NULL; 2597 Cmat->cprowIndices = NULL; 2598 } 2599 Ccusp->nrows = ciscompressed ? c->compressedrow.nrows : m; 2600 Ccusp->mat = Cmat; 2601 Ccusp->mat->mat = Ccsr; 2602 Ccsr->num_rows = Ccusp->nrows; 2603 Ccsr->num_cols = n; 2604 Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1); 2605 stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat); 2606 stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 2607 stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 2608 cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 2609 cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 2610 cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 2611 cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2612 cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2613 cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2614 if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raises errors in different calls when matrices have zero rows/columns! */ 2615 thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0); 2616 c->nz = 0; 2617 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2618 Ccsr->values = new THRUSTARRAY(c->nz); 2619 goto finalizesym; 2620 } 2621 2622 if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2623 if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2624 Acsr = (CsrMatrix*)Amat->mat; 2625 if (!biscompressed) { 2626 Bcsr = (CsrMatrix*)Bmat->mat; 2627 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2628 BmatSpDescr = Bmat->matDescr; 2629 #endif 2630 } else { /* we need to use row offsets for the full matrix */ 2631 CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat; 2632 Bcsr = new CsrMatrix; 2633 Bcsr->num_rows = B->rmap->n; 2634 Bcsr->num_cols = cBcsr->num_cols; 2635 Bcsr->num_entries = cBcsr->num_entries; 2636 Bcsr->column_indices = cBcsr->column_indices; 2637 Bcsr->values = cBcsr->values; 2638 if (!Bcusp->rowoffsets_gpu) { 2639 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 2640 Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 2641 ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 2642 } 2643 Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 2644 mmdata->Bcsr = Bcsr; 2645 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2646 if (Bcsr->num_rows && Bcsr->num_cols) { 2647 stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, 2648 Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2649 Bcsr->values->data().get(), 2650 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2651 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2652 } 2653 BmatSpDescr = mmdata->matSpBDescr; 2654 #endif 2655 } 2656 if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct"); 2657 if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct"); 2658 /* precompute flops count */ 2659 if (ptype == MATPRODUCT_AB) { 2660 for (i=0, flops = 0;
i<A->rmap->n; i++) { 2661 const PetscInt st = a->i[i]; 2662 const PetscInt en = a->i[i+1]; 2663 for (j=st; j<en; j++) { 2664 const PetscInt brow = a->j[j]; 2665 flops += 2.*(b->i[brow+1] - b->i[brow]); 2666 } 2667 } 2668 } else if (ptype == MATPRODUCT_AtB) { 2669 for (i=0, flops = 0; i<A->rmap->n; i++) { 2670 const PetscInt anzi = a->i[i+1] - a->i[i]; 2671 const PetscInt bnzi = b->i[i+1] - b->i[i]; 2672 flops += (2.*anzi)*bnzi; 2673 } 2674 } else { /* TODO */ 2675 flops = 0.; 2676 } 2677 2678 mmdata->flops = flops; 2679 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2680 2681 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2682 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2683 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, 2684 NULL, NULL, NULL, 2685 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2686 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2687 stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2688 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2689 { 2690 /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it. 2691 We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse 2692 */ 2693 void* dBuffer1 = NULL; 2694 void* dBuffer2 = NULL; 2695 void* dBuffer3 = NULL; 2696 /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */ 2697 size_t bufferSize1 = 0; 2698 size_t bufferSize2 = 0; 2699 size_t bufferSize3 = 0; 2700 size_t bufferSize4 = 0; 2701 size_t bufferSize5 = 0; 2702 2703 /*----------------------------------------------------------------------*/ 2704 /* ask bufferSize1 bytes for external memory */ 2705 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2706 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2707 &bufferSize1, NULL);CHKERRCUSPARSE(stat); 2708 cerr = cudaMalloc((void**) &dBuffer1, bufferSize1);CHKERRCUDA(cerr); 2709 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2710 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2711 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2712 &bufferSize1, dBuffer1);CHKERRCUSPARSE(stat); 2713 2714 /*----------------------------------------------------------------------*/ 2715 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2716 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2717 &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);CHKERRCUSPARSE(stat); 2718 cerr = cudaMalloc((void**) &dBuffer2, bufferSize2);CHKERRCUDA(cerr); 2719 cerr = cudaMalloc((void**) &dBuffer3, bufferSize3);CHKERRCUDA(cerr); 2720 cerr = cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4);CHKERRCUDA(cerr); 2721 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2722 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2723 &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);CHKERRCUSPARSE(stat); 2724 cerr = cudaFree(dBuffer1);CHKERRCUDA(cerr); 2725 cerr = cudaFree(dBuffer2);CHKERRCUDA(cerr); 2726 2727 /*----------------------------------------------------------------------*/ 2728 /* get matrix C non-zero entries C_nnz1 */ 2729 stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat); 2730 c->nz = 
(PetscInt) C_nnz1; 2731 /* allocate matrix C */ 2732 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2733 Ccsr->values = new THRUSTARRAY(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2734 /* update matC with the new pointers */ 2735 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2736 Ccsr->values->data().get());CHKERRCUSPARSE(stat); 2737 2738 /*----------------------------------------------------------------------*/ 2739 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2740 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2741 &bufferSize5, NULL);CHKERRCUSPARSE(stat); 2742 cerr = cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5);CHKERRCUDA(cerr); 2743 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2744 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2745 &bufferSize5, mmdata->dBuffer5);CHKERRCUSPARSE(stat); 2746 cerr = cudaFree(dBuffer3);CHKERRCUDA(cerr); 2747 stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, 2748 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2749 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2750 mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2751 ierr = PetscInfo9(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024);CHKERRQ(ierr); 2752 } 2753 #else 2754 size_t bufSize2; 2755 /* ask bufferSize bytes for external memory */ 2756 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 2757 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2758 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2759 mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat); 2760 cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr); 2761 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2762 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 2763 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2764 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2765 mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat); 2766 /* ask bufferSize again bytes for external memory */ 2767 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2768 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2769 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2770 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat); 2771 /* The CUSPARSE documentation is not clear, nor the API 2772 We need both buffers to perform the operations properly! 2773 mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 2774 it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 2775 is stored in the descriptor! What a messy API... 
*/ 2776 cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr); 2777 /* compute the intermediate product of A * B */ 2778 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2779 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2780 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2781 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2782 /* get matrix C non-zero entries C_nnz1 */ 2783 stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat); 2784 c->nz = (PetscInt) C_nnz1; 2785 ierr = PetscInfo9(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr); 2786 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2787 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2788 Ccsr->values = new THRUSTARRAY(c->nz); 2789 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2790 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2791 Ccsr->values->data().get());CHKERRCUSPARSE(stat); 2792 stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, 2793 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2794 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2795 #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2796 #else 2797 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 2798 stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, 2799 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2800 Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2801 Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2802 Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat); 2803 c->nz = cnz; 2804 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2805 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2806 Ccsr->values = new THRUSTARRAY(c->nz); 2807 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2808 2809 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2810 /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 2811 I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 2812 D is NULL, despite the fact that CUSPARSE documentation claims it is supported! 
*/ 2813 stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, 2814 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2815 Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2816 Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2817 Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2818 #endif 2819 ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2820 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2821 finalizesym: 2822 c->singlemalloc = PETSC_FALSE; 2823 c->free_a = PETSC_TRUE; 2824 c->free_ij = PETSC_TRUE; 2825 ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 2826 ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 2827 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 2828 PetscInt *d_i = c->i; 2829 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 2830 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 2831 ii = *Ccsr->row_offsets; 2832 jj = *Ccsr->column_indices; 2833 if (ciscompressed) d_i = c->compressedrow.i; 2834 cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2835 cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2836 } else { 2837 PetscInt *d_i = c->i; 2838 if (ciscompressed) d_i = c->compressedrow.i; 2839 cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2840 cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2841 } 2842 if (ciscompressed) { /* need to expand host row offsets */ 2843 PetscInt r = 0; 2844 c->i[0] = 0; 2845 for (k = 0; k < c->compressedrow.nrows; k++) { 2846 const PetscInt next = c->compressedrow.rindex[k]; 2847 const PetscInt old = c->compressedrow.i[k]; 2848 for (; r < next; r++) c->i[r+1] = old; 2849 } 2850 for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows]; 2851 } 2852 ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 2853 ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 2854 ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 2855 c->maxnz = c->nz; 2856 c->nonzerorowcnt = 0; 2857 c->rmax = 0; 2858 for (k = 0; k < m; k++) { 2859 const PetscInt nn = c->i[k+1] - c->i[k]; 2860 c->ilen[k] = c->imax[k] = nn; 2861 c->nonzerorowcnt += (PetscInt)!!nn; 2862 c->rmax = PetscMax(c->rmax,nn); 2863 } 2864 ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr); 2865 ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 2866 Ccsr->num_entries = c->nz; 2867 2868 C->nonzerostate++; 2869 ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr); 2870 ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr); 2871 Ccusp->nonzerostate = C->nonzerostate; 2872 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 2873 C->preallocated = PETSC_TRUE; 2874 C->assembled = PETSC_FALSE; 2875 C->was_assembled = PETSC_FALSE; 2876 if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */ 2877 mmdata->reusesym = PETSC_TRUE; 2878 C->offloadmask = PETSC_OFFLOAD_GPU; 2879 } 2880 C->ops->productnumeric = 
MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2881 PetscFunctionReturn(0); 2882 } 2883 2884 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 2885 2886 /* handles sparse or dense B */ 2887 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat) 2888 { 2889 Mat_Product *product = mat->product; 2890 PetscErrorCode ierr; 2891 PetscBool isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE; 2892 2893 PetscFunctionBegin; 2894 MatCheckProduct(mat,1); 2895 ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr); 2896 if (!product->A->boundtocpu && !product->B->boundtocpu) { 2897 ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr); 2898 } 2899 if (product->type == MATPRODUCT_ABC) { 2900 Ciscusp = PETSC_FALSE; 2901 if (!product->C->boundtocpu) { 2902 ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr); 2903 } 2904 } 2905 if (Biscusp && Ciscusp) { /* we can always select the CPU backend */ 2906 PetscBool usecpu = PETSC_FALSE; 2907 switch (product->type) { 2908 case MATPRODUCT_AB: 2909 if (product->api_user) { 2910 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");CHKERRQ(ierr); 2911 ierr = PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2912 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2913 } else { 2914 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr); 2915 ierr = PetscOptionsBool("-matproduct_ab_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2916 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2917 } 2918 break; 2919 case MATPRODUCT_AtB: 2920 if (product->api_user) { 2921 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr); 2922 ierr = PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2923 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2924 } else { 2925 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr); 2926 ierr = PetscOptionsBool("-matproduct_atb_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2927 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2928 } 2929 break; 2930 case MATPRODUCT_PtAP: 2931 if (product->api_user) { 2932 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");CHKERRQ(ierr); 2933 ierr = PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2934 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2935 } else { 2936 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr); 2937 ierr = PetscOptionsBool("-matproduct_ptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2938 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2939 } 2940 break; 2941 case MATPRODUCT_RARt: 2942 if (product->api_user) { 2943 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");CHKERRQ(ierr); 2944 ierr = PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2945 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2946 } else { 2947 
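/* product created through the MatProduct API (product->api_user is false): same CPU fallback as above, exposed here under the -matproduct_rart_backend_cpu name */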
ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");CHKERRQ(ierr); 2948 ierr = PetscOptionsBool("-matproduct_rart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2949 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2950 } 2951 break; 2952 case MATPRODUCT_ABC: 2953 if (product->api_user) { 2954 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");CHKERRQ(ierr); 2955 ierr = PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2956 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2957 } else { 2958 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");CHKERRQ(ierr); 2959 ierr = PetscOptionsBool("-matproduct_abc_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2960 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2961 } 2962 break; 2963 default: 2964 break; 2965 } 2966 if (usecpu) Biscusp = Ciscusp = PETSC_FALSE; 2967 } 2968 /* dispatch */ 2969 if (isdense) { 2970 switch (product->type) { 2971 case MATPRODUCT_AB: 2972 case MATPRODUCT_AtB: 2973 case MATPRODUCT_ABt: 2974 case MATPRODUCT_PtAP: 2975 case MATPRODUCT_RARt: 2976 if (product->A->boundtocpu) { 2977 ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr); 2978 } else { 2979 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 2980 } 2981 break; 2982 case MATPRODUCT_ABC: 2983 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2984 break; 2985 default: 2986 break; 2987 } 2988 } else if (Biscusp && Ciscusp) { 2989 switch (product->type) { 2990 case MATPRODUCT_AB: 2991 case MATPRODUCT_AtB: 2992 case MATPRODUCT_ABt: 2993 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2994 break; 2995 case MATPRODUCT_PtAP: 2996 case MATPRODUCT_RARt: 2997 case MATPRODUCT_ABC: 2998 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2999 break; 3000 default: 3001 break; 3002 } 3003 } else { /* fallback for AIJ */ 3004 ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 3005 } 3006 PetscFunctionReturn(0); 3007 } 3008 3009 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 3010 { 3011 PetscErrorCode ierr; 3012 3013 PetscFunctionBegin; 3014 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 3015 PetscFunctionReturn(0); 3016 } 3017 3018 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz) 3019 { 3020 PetscErrorCode ierr; 3021 3022 PetscFunctionBegin; 3023 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 3024 PetscFunctionReturn(0); 3025 } 3026 3027 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 3028 { 3029 PetscErrorCode ierr; 3030 3031 PetscFunctionBegin; 3032 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 3033 PetscFunctionReturn(0); 3034 } 3035 3036 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 3037 { 3038 PetscErrorCode ierr; 3039 3040 PetscFunctionBegin; 3041 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 3042 PetscFunctionReturn(0); 3043 } 3044 3045 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 3046 { 3047 PetscErrorCode ierr; 3048 3049 PetscFunctionBegin; 3050 ierr = 
MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 3051 PetscFunctionReturn(0); 3052 } 3053 3054 __global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y) 3055 { 3056 int i = blockIdx.x*blockDim.x + threadIdx.x; 3057 if (i < n) y[idx[i]] += x[i]; 3058 } 3059 3060 /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 3061 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm) 3062 { 3063 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3064 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 3065 Mat_SeqAIJCUSPARSEMultStruct *matstruct; 3066 PetscScalar *xarray,*zarray,*dptr,*beta,*xptr; 3067 PetscErrorCode ierr; 3068 cusparseStatus_t stat; 3069 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 3070 PetscBool compressed; 3071 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3072 PetscInt nx,ny; 3073 #endif 3074 3075 PetscFunctionBegin; 3076 if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported"); 3077 if (!a->nonzerorowcnt) { 3078 if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);} 3079 else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);} 3080 PetscFunctionReturn(0); 3081 } 3082 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 3083 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 3084 if (!trans) { 3085 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 3086 if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 3087 } else { 3088 if (herm || !A->form_explicit_transpose) { 3089 opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 3090 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 3091 } else { 3092 if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);} 3093 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 3094 } 3095 } 3096 /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 3097 compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE; 3098 3099 try { 3100 ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 3101 if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */ 3102 else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */ 3103 3104 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3105 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3106 /* z = A x + beta y. 3107 If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 3108 When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 3109 */ 3110 xptr = xarray; 3111 dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 3112 beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 3113 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3114 /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 3115 allocated to accommodate different uses. So we get the length info directly from mat. 
3116 */ 3117 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3118 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 3119 nx = mat->num_cols; 3120 ny = mat->num_rows; 3121 } 3122 #endif 3123 } else { 3124 /* z = A^T x + beta y 3125 If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 3126 Note A^Tx is of full length, so we set beta to 1.0 if y exists. 3127 */ 3128 xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 3129 dptr = zarray; 3130 beta = yy ? matstruct->beta_one : matstruct->beta_zero; 3131 if (compressed) { /* Scatter x to work vector */ 3132 thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 3133 thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 3134 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 3135 VecCUDAEqualsReverse()); 3136 } 3137 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3138 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3139 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 3140 nx = mat->num_rows; 3141 ny = mat->num_cols; 3142 } 3143 #endif 3144 } 3145 3146 /* csr_spmv does y = alpha op(A) x + beta y */ 3147 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3148 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3149 if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 3150 if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 3151 cudaError_t cerr; 3152 stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 3153 stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 3154 stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, 3155 matstruct->matDescr, 3156 matstruct->cuSpMV[opA].vecXDescr, beta, 3157 matstruct->cuSpMV[opA].vecYDescr, 3158 cusparse_scalartype, 3159 cusparsestruct->spmvAlg, 3160 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat); 3161 cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr); 3162 3163 matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 3164 } else { 3165 /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 3166 stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat); 3167 stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat); 3168 } 3169 3170 stat = cusparseSpMV(cusparsestruct->handle, opA, 3171 matstruct->alpha_one, 3172 matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */ 3173 matstruct->cuSpMV[opA].vecXDescr, 3174 beta, 3175 matstruct->cuSpMV[opA].vecYDescr, 3176 cusparse_scalartype, 3177 cusparsestruct->spmvAlg, 3178 matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat); 3179 #else 3180 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 3181 stat = cusparse_csr_spmv(cusparsestruct->handle, opA, 3182 mat->num_rows, mat->num_cols, 3183 mat->num_entries, matstruct->alpha_one, matstruct->descr, 3184 mat->values->data().get(), mat->row_offsets->data().get(), 3185 
mat->column_indices->data().get(), xptr, beta, 3186 dptr);CHKERRCUSPARSE(stat); 3187 #endif 3188 } else { 3189 if (cusparsestruct->nrows) { 3190 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3191 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3192 #else 3193 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 3194 stat = cusparse_hyb_spmv(cusparsestruct->handle, opA, 3195 matstruct->alpha_one, matstruct->descr, hybMat, 3196 xptr, beta, 3197 dptr);CHKERRCUSPARSE(stat); 3198 #endif 3199 } 3200 } 3201 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3202 3203 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3204 if (yy) { /* MatMultAdd: zz = A*xx + yy */ 3205 if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 3206 ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */ 3207 } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */ 3208 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 3209 } 3210 } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 3211 ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr); 3212 } 3213 3214 /* ScatterAdd the result from work vector into the full vector when A is compressed */ 3215 if (compressed) { 3216 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3217 /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered) 3218 and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to 3219 prevent that. So I just add a ScatterAdd kernel. 3220 */ 3221 #if 0 3222 thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 3223 thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream), 3224 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 3225 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 3226 VecCUDAPlusEquals()); 3227 #else 3228 PetscInt n = matstruct->cprowIndices->size(); 3229 ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray); 3230 #endif 3231 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3232 } 3233 } else { 3234 if (yy && yy != zz) { 3235 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 3236 } 3237 } 3238 ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 3239 if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);} 3240 else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);} 3241 } catch(char *ex) { 3242 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 3243 } 3244 if (yy) { 3245 ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); 3246 } else { 3247 ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr); 3248 } 3249 PetscFunctionReturn(0); 3250 } 3251 3252 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 3253 { 3254 PetscErrorCode ierr; 3255 3256 PetscFunctionBegin; 3257 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 3258 PetscFunctionReturn(0); 3259 } 3260 3261 static
PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 3262 { 3263 PetscErrorCode ierr; 3264 PetscObjectState onnz = A->nonzerostate; 3265 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3266 3267 PetscFunctionBegin; 3268 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 3269 if (onnz != A->nonzerostate && cusp->deviceMat) { 3270 cudaError_t cerr; 3271 3272 ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr); 3273 cerr = cudaFree(cusp->deviceMat);CHKERRCUDA(cerr); 3274 cusp->deviceMat = NULL; 3275 } 3276 PetscFunctionReturn(0); 3277 } 3278 3279 /* --------------------------------------------------------------------------------*/ 3280 /*@ 3281 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 3282 (the default parallel PETSc format). This matrix will ultimately be pushed down 3283 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 3284 assembly performance the user should preallocate the matrix storage by setting 3285 the parameter nz (or the array nnz). By setting these parameters accurately, 3286 performance during matrix assembly can be increased by more than a factor of 50. 3287 3288 Collective 3289 3290 Input Parameters: 3291 + comm - MPI communicator, set to PETSC_COMM_SELF 3292 . m - number of rows 3293 . n - number of columns 3294 . nz - number of nonzeros per row (same for all rows) 3295 - nnz - array containing the number of nonzeros in the various rows 3296 (possibly different for each row) or NULL 3297 3298 Output Parameter: 3299 . A - the matrix 3300 3301 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 3302 MatXXXXSetPreallocation() paradigm instead of this routine directly. 3303 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 3304 3305 Notes: 3306 If nnz is given then nz is ignored 3307 3308 The AIJ format (also called the Yale sparse matrix format or 3309 compressed row storage), is fully compatible with standard Fortran 77 3310 storage. That is, the stored row and column indices can begin at 3311 either one (as in Fortran) or zero. See the users' manual for details. 3312 3313 Specify the preallocated storage with either nz or nnz (not both). 3314 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 3315 allocation. For large problems you MUST preallocate memory or you 3316 will get TERRIBLE performance, see the users' manual chapter on matrices. 3317 3318 By default, this format uses inodes (identical nodes) when possible, to 3319 improve numerical efficiency of matrix-vector products and solves. We 3320 search for consecutive rows with the same nonzero structure, thereby 3321 reusing matrix information to achieve increased efficiency.
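   Example (a minimal, illustrative sketch of the recommended MatCreate()/MatSetType()/MatSeqAIJSetPreallocation()
   path; error checking with CHKERRQ() is omitted and m, n, nz are placeholders):

      Mat mat;
      MatCreate(PETSC_COMM_SELF,&mat);
      MatSetSizes(mat,m,n,m,n);
      MatSetType(mat,MATSEQAIJCUSPARSE);
      MatSeqAIJSetPreallocation(mat,nz,NULL);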
3322 3323 Level: intermediate 3324 3325 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 3326 @*/ 3327 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 3328 { 3329 PetscErrorCode ierr; 3330 3331 PetscFunctionBegin; 3332 ierr = MatCreate(comm,A);CHKERRQ(ierr); 3333 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 3334 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3335 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 3336 PetscFunctionReturn(0); 3337 } 3338 3339 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 3340 { 3341 PetscErrorCode ierr; 3342 3343 PetscFunctionBegin; 3344 if (A->factortype == MAT_FACTOR_NONE) { 3345 ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr); 3346 } else { 3347 ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr); 3348 } 3349 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr); 3350 ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr); 3351 ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetUseCPUSolve_C",NULL);CHKERRQ(ierr); 3352 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 3353 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 3354 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr); 3355 ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 3356 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 3357 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 3358 ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaijcusparse_hypre_C",NULL);CHKERRQ(ierr); 3359 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 3360 PetscFunctionReturn(0); 3361 } 3362 3363 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*); 3364 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool); 3365 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B) 3366 { 3367 PetscErrorCode ierr; 3368 3369 PetscFunctionBegin; 3370 ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 3371 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr); 3372 PetscFunctionReturn(0); 3373 } 3374 3375 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str) 3376 { 3377 PetscErrorCode ierr; 3378 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 3379 Mat_SeqAIJCUSPARSE *cy; 3380 Mat_SeqAIJCUSPARSE *cx; 3381 PetscScalar *ay; 3382 const PetscScalar *ax; 3383 CsrMatrix *csry,*csrx; 3384 3385 PetscFunctionBegin; 3386 cy = (Mat_SeqAIJCUSPARSE*)Y->spptr; 3387 cx = (Mat_SeqAIJCUSPARSE*)X->spptr; 3388 if (X->ops->axpy != Y->ops->axpy) { 3389 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr); 3390 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 3391 PetscFunctionReturn(0); 3392 } 3393 /* if we are here, it means both matrices are bound to GPU */ 3394 ierr = 
MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr); 3395 ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr); 3396 if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported"); 3397 if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported"); 3398 csry = (CsrMatrix*)cy->mat->mat; 3399 csrx = (CsrMatrix*)cx->mat->mat; 3400 /* see if we can turn this into a cublas axpy */ 3401 if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) { 3402 bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin()); 3403 if (eq) { 3404 eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin()); 3405 } 3406 if (eq) str = SAME_NONZERO_PATTERN; 3407 } 3408 /* spgeam is buggy with one column */ 3409 if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN; 3410 3411 if (str == SUBSET_NONZERO_PATTERN) { 3412 cusparseStatus_t stat; 3413 PetscScalar b = 1.0; 3414 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3415 size_t bufferSize; 3416 void *buffer; 3417 cudaError_t cerr; 3418 #endif 3419 3420 ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr); 3421 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3422 stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 3423 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3424 stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n, 3425 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3426 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3427 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat); 3428 cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr); 3429 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3430 stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3431 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3432 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3433 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat); 3434 ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr); 3435 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3436 cerr = cudaFree(buffer);CHKERRCUDA(cerr); 3437 #else 3438 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3439 stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3440 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3441 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3442 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat); 3443 ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr); 3444 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3445 #endif 3446 stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 3447 ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr); 3448 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3449 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3450 } else if (str == SAME_NONZERO_PATTERN) { 3451 cublasHandle_t cublasv2handle; 3452 
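/* same nonzero pattern: Y += a*X only touches the nz stored values (ay[k] += a*ax[k], k = 0..nz-1), so it reduces to a dense cuBLAS axpy on the value arrays */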
cublasStatus_t berr; 3453 PetscBLASInt one = 1, bnz = 1; 3454 3455 ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr); 3456 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3457 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 3458 ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr); 3459 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3460 berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr); 3461 ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr); 3462 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3463 ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr); 3464 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3465 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3466 } else { 3467 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr); 3468 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 3469 } 3470 PetscFunctionReturn(0); 3471 } 3472 3473 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a) 3474 { 3475 PetscErrorCode ierr; 3476 Mat_SeqAIJ *y = (Mat_SeqAIJ*)Y->data; 3477 PetscScalar *ay; 3478 cublasHandle_t cublasv2handle; 3479 cublasStatus_t berr; 3480 PetscBLASInt one = 1, bnz = 1; 3481 3482 PetscFunctionBegin; 3483 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3484 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 3485 ierr = PetscBLASIntCast(y->nz,&bnz);CHKERRQ(ierr); 3486 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3487 berr = cublasXscal(cublasv2handle,bnz,&a,ay,one);CHKERRCUBLAS(berr); 3488 ierr = PetscLogGpuFlops(bnz);CHKERRQ(ierr); 3489 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3490 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3491 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3492 PetscFunctionReturn(0); 3493 } 3494 3495 static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) 3496 { 3497 PetscErrorCode ierr; 3498 PetscBool both = PETSC_FALSE; 3499 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3500 3501 PetscFunctionBegin; 3502 if (A->factortype == MAT_FACTOR_NONE) { 3503 Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr; 3504 if (spptr->mat) { 3505 CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat; 3506 if (matrix->values) { 3507 both = PETSC_TRUE; 3508 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3509 } 3510 } 3511 if (spptr->matTranspose) { 3512 CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat; 3513 if (matrix->values) { 3514 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3515 } 3516 } 3517 } 3518 //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 3519 ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr); 3520 ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 3521 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 3522 else A->offloadmask = PETSC_OFFLOAD_CPU; 3523 PetscFunctionReturn(0); 3524 } 3525 3526 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg) 3527 { 3528 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3529 PetscErrorCode ierr; 3530 3531 PetscFunctionBegin; 3532 if (A->factortype != MAT_FACTOR_NONE) { 3533 A->boundtocpu = flg; 3534 PetscFunctionReturn(0); 3535 } 3536 if (flg) { 3537 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 3538 3539 A->ops->scale = MatScale_SeqAIJ; 3540 A->ops->axpy = MatAXPY_SeqAIJ; 3541 A->ops->zeroentries = MatZeroEntries_SeqAIJ; 3542 A->ops->mult = MatMult_SeqAIJ; 3543 A->ops->multadd = MatMultAdd_SeqAIJ; 3544 A->ops->multtranspose = MatMultTranspose_SeqAIJ; 3545 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 
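/* the GPU Hermitian-transpose kernels do not apply once the matrix is bound to the CPU; reset these ops so PETSc's generic handling is used instead */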
3546 A->ops->multhermitiantranspose = NULL; 3547 A->ops->multhermitiantransposeadd = NULL; 3548 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ; 3549 ierr = PetscMemzero(a->ops,sizeof(Mat_SeqAIJOps));CHKERRQ(ierr); 3550 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr); 3551 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 3552 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 3553 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 3554 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 3555 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr); 3556 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr); 3557 } else { 3558 A->ops->scale = MatScale_SeqAIJCUSPARSE; 3559 A->ops->axpy = MatAXPY_SeqAIJCUSPARSE; 3560 A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE; 3561 A->ops->mult = MatMult_SeqAIJCUSPARSE; 3562 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 3563 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 3564 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 3565 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 3566 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 3567 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE; 3568 a->ops->getarray = MatSeqAIJGetArray_SeqAIJCUSPARSE; 3569 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJCUSPARSE; 3570 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE; 3571 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE; 3572 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE; 3573 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE; 3574 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr); 3575 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3576 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3577 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3578 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3579 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3580 } 3581 A->boundtocpu = flg; 3582 if (flg && a->inode.size) { 3583 a->inode.use = PETSC_TRUE; 3584 } else { 3585 a->inode.use = PETSC_FALSE; 3586 } 3587 PetscFunctionReturn(0); 3588 } 3589 3590 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 3591 { 3592 PetscErrorCode ierr; 3593 cusparseStatus_t stat; 3594 Mat B; 3595 3596 PetscFunctionBegin; 3597 ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */ 3598 if (reuse == MAT_INITIAL_MATRIX) { 3599 ierr = 
MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 3600 } else if (reuse == MAT_REUSE_MATRIX) { 3601 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 3602 } 3603 B = *newmat; 3604 3605 ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 3606 ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr); 3607 3608 if (reuse != MAT_REUSE_MATRIX && !B->spptr) { 3609 if (B->factortype == MAT_FACTOR_NONE) { 3610 Mat_SeqAIJCUSPARSE *spptr; 3611 ierr = PetscNew(&spptr);CHKERRQ(ierr); 3612 stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3613 stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat); 3614 spptr->format = MAT_CUSPARSE_CSR; 3615 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3616 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 3617 spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */ 3618 #else 3619 spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 3620 #endif 3621 spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 3622 spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 3623 #endif 3624 B->spptr = spptr; 3625 } else { 3626 Mat_SeqAIJCUSPARSETriFactors *spptr; 3627 3628 ierr = PetscNew(&spptr);CHKERRQ(ierr); 3629 stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3630 stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat); 3631 B->spptr = spptr; 3632 } 3633 B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 3634 } 3635 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 3636 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 3637 B->ops->setoption = MatSetOption_SeqAIJCUSPARSE; 3638 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 3639 B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 3640 B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 3641 3642 ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr); 3643 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3644 ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 3645 #if defined(PETSC_HAVE_HYPRE) 3646 ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr); 3647 #endif 3648 ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetUseCPUSolve_C",MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE);CHKERRQ(ierr); 3649 PetscFunctionReturn(0); 3650 } 3651 3652 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 3653 { 3654 PetscErrorCode ierr; 3655 3656 PetscFunctionBegin; 3657 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 3658 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 3659 PetscFunctionReturn(0); 3660 } 3661 3662 /*MC 3663 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 3664 3665 A matrix type whose data resides on Nvidia GPUs. These matrices can be in either 3666 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 3667 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 3668 3669 Options Database Keys: 3670 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 3671 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3672 - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3673 + -mat_cusparse_use_cpu_solve - Do MatSolve on CPU 3674 3675 Level: beginner 3676 3677 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 3678 M*/ 3679 3680 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*); 3681 3682 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 3683 { 3684 PetscErrorCode ierr; 3685 3686 PetscFunctionBegin; 3687 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr); 3688 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3689 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3690 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3691 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3692 3693 PetscFunctionReturn(0); 3694 } 3695 3696 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 3697 { 3698 PetscErrorCode ierr; 3699 cusparseStatus_t stat; 3700 3701 PetscFunctionBegin; 3702 if (*cusparsestruct) { 3703 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr); 3704 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr); 3705 delete (*cusparsestruct)->workVector; 3706 delete (*cusparsestruct)->rowoffsets_gpu; 3707 delete (*cusparsestruct)->cooPerm; 3708 delete (*cusparsestruct)->cooPerm_a; 3709 delete (*cusparsestruct)->csr2csc_i; 3710 if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);} 3711 ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr); 3712 } 3713 PetscFunctionReturn(0); 3714 } 3715 3716 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 3717 { 3718 PetscFunctionBegin; 3719 if (*mat) { 3720 delete (*mat)->values; 3721 delete (*mat)->column_indices; 3722 delete (*mat)->row_offsets; 3723 delete *mat; 3724 *mat = 0; 3725 } 3726 PetscFunctionReturn(0); 3727 } 3728 3729 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 3730 { 3731 cusparseStatus_t stat; 3732 PetscErrorCode ierr; 3733 3734 PetscFunctionBegin; 3735 if (*trifactor) { 3736 if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); } 3737 if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); } 3738 ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 3739 if ((*trifactor)->solveBuffer) {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);} 3740 if ((*trifactor)->AA_h) {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);} 3741 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3742 if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);} 3743 #endif 3744 ierr = PetscFree(*trifactor);CHKERRQ(ierr); 3745 } 3746 
PetscFunctionReturn(0); 3747 } 3748 3749 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 3750 { 3751 CsrMatrix *mat; 3752 cusparseStatus_t stat; 3753 cudaError_t err; 3754 3755 PetscFunctionBegin; 3756 if (*matstruct) { 3757 if ((*matstruct)->mat) { 3758 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 3759 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3760 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3761 #else 3762 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 3763 stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat); 3764 #endif 3765 } else { 3766 mat = (CsrMatrix*)(*matstruct)->mat; 3767 CsrMatrix_Destroy(&mat); 3768 } 3769 } 3770 if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); } 3771 delete (*matstruct)->cprowIndices; 3772 if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); } 3773 if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); } 3774 if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); } 3775 3776 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3777 Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 3778 if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);} 3779 for (int i=0; i<3; i++) { 3780 if (mdata->cuSpMV[i].initialized) { 3781 err = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err); 3782 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat); 3783 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat); 3784 } 3785 } 3786 #endif 3787 delete *matstruct; 3788 *matstruct = NULL; 3789 } 3790 PetscFunctionReturn(0); 3791 } 3792 3793 PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors) 3794 { 3795 PetscErrorCode ierr; 3796 3797 PetscFunctionBegin; 3798 if (*trifactors) { 3799 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr); 3800 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr); 3801 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr); 3802 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr); 3803 delete (*trifactors)->rpermIndices; 3804 delete (*trifactors)->cpermIndices; 3805 delete (*trifactors)->workVector; 3806 (*trifactors)->rpermIndices = NULL; 3807 (*trifactors)->cpermIndices = NULL; 3808 (*trifactors)->workVector = NULL; 3809 if ((*trifactors)->a_band_d) {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);} 3810 if ((*trifactors)->i_band_d) {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);} 3811 (*trifactors)->init_dev_prop = PETSC_FALSE; 3812 } 3813 PetscFunctionReturn(0); 3814 } 3815 3816 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 3817 { 3818 PetscErrorCode ierr; 3819 cusparseHandle_t handle; 3820 cusparseStatus_t stat; 3821 3822 PetscFunctionBegin; 3823 if (*trifactors) { 3824 ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr); 3825 if (handle = (*trifactors)->handle) { 3826 stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 3827 } 3828 ierr = PetscFree(*trifactors);CHKERRQ(ierr); 3829 } 3830 PetscFunctionReturn(0); 3831 } 3832 3833 struct IJCompare 
3834 { 3835 __host__ __device__ 3836 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3837 { 3838 if (t1.get<0>() < t2.get<0>()) return true; 3839 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 3840 return false; 3841 } 3842 }; 3843 3844 struct IJEqual 3845 { 3846 __host__ __device__ 3847 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3848 { 3849 if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false; 3850 return true; 3851 } 3852 }; 3853 3854 struct IJDiff 3855 { 3856 __host__ __device__ 3857 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 3858 { 3859 return t1 == t2 ? 0 : 1; 3860 } 3861 }; 3862 3863 struct IJSum 3864 { 3865 __host__ __device__ 3866 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 3867 { 3868 return t1||t2; 3869 } 3870 }; 3871 3872 #include <thrust/iterator/discard_iterator.h> 3873 PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 3874 { 3875 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3876 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3877 THRUSTARRAY *cooPerm_v = NULL; 3878 thrust::device_ptr<const PetscScalar> d_v; 3879 CsrMatrix *matrix; 3880 PetscErrorCode ierr; 3881 PetscInt n; 3882 3883 PetscFunctionBegin; 3884 if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct"); 3885 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix"); 3886 if (!cusp->cooPerm) { 3887 ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3888 ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3889 PetscFunctionReturn(0); 3890 } 3891 matrix = (CsrMatrix*)cusp->mat->mat; 3892 if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3893 if (!v) { 3894 if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3895 goto finalize; 3896 } 3897 n = cusp->cooPerm->size(); 3898 if (isCudaMem(v)) { 3899 d_v = thrust::device_pointer_cast(v); 3900 } else { 3901 cooPerm_v = new THRUSTARRAY(n); 3902 cooPerm_v->assign(v,v+n); 3903 d_v = cooPerm_v->data(); 3904 ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); 3905 } 3906 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3907 if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */ 3908 if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add these them */ 3909 THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size()); 3910 auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3911 /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output) 3912 cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonozeros in d_v[]. 3913 cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero. 
3914 */ 3915 thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3916 thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>()); 3917 delete cooPerm_w; 3918 } else { 3919 /* all nonzeros in d_v[] are unique entries */ 3920 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 3921 matrix->values->begin())); 3922 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 3923 matrix->values->end())); 3924 thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */ 3925 } 3926 } else { 3927 if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */ 3928 auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3929 thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3930 } else { 3931 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 3932 matrix->values->begin())); 3933 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 3934 matrix->values->end())); 3935 thrust::for_each(zibit,zieit,VecCUDAEquals()); 3936 } 3937 } 3938 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3939 finalize: 3940 delete cooPerm_v; 3941 A->offloadmask = PETSC_OFFLOAD_GPU; 3942 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 3943 /* shorter version of MatAssemblyEnd_SeqAIJ */ 3944 ierr = PetscInfo3(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr); 3945 ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 3946 ierr = PetscInfo1(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rmax);CHKERRQ(ierr); 3947 a->reallocs = 0; 3948 A->info.mallocs += 0; 3949 A->info.nz_unneeded = 0; 3950 A->assembled = A->was_assembled = PETSC_TRUE; 3951 A->num_ass++; 3952 PetscFunctionReturn(0); 3953 } 3954 3955 PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy) 3956 { 3957 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3958 PetscErrorCode ierr; 3959 3960 PetscFunctionBegin; 3961 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3962 if (!cusp) PetscFunctionReturn(0); 3963 if (destroy) { 3964 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr); 3965 delete cusp->csr2csc_i; 3966 cusp->csr2csc_i = NULL; 3967 } 3968 A->transupdated = PETSC_FALSE; 3969 PetscFunctionReturn(0); 3970 } 3971 3972 #include <thrust/binary_search.h> 3973 PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[]) 3974 { 3975 PetscErrorCode ierr; 3976 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3977 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3978 PetscInt cooPerm_n, nzr = 0; 3979 cudaError_t cerr; 3980 3981 PetscFunctionBegin; 3982 ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr); 3983 ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr); 3984 cooPerm_n = cusp->cooPerm ? 
cusp->cooPerm->size() : 0; 3985 if (n != cooPerm_n) { 3986 delete cusp->cooPerm; 3987 delete cusp->cooPerm_a; 3988 cusp->cooPerm = NULL; 3989 cusp->cooPerm_a = NULL; 3990 } 3991 if (n) { 3992 THRUSTINTARRAY d_i(n); 3993 THRUSTINTARRAY d_j(n); 3994 THRUSTINTARRAY ii(A->rmap->n); 3995 3996 if (!cusp->cooPerm) { cusp->cooPerm = new THRUSTINTARRAY(n); } 3997 if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); } 3998 3999 ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 4000 d_i.assign(coo_i,coo_i+n); 4001 d_j.assign(coo_j,coo_j+n); 4002 4003 /* Ex. 4004 n = 6 4005 coo_i = [3,3,1,4,1,4] 4006 coo_j = [3,2,2,5,2,6] 4007 */ 4008 auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin())); 4009 auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end())); 4010 4011 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4012 thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0); 4013 thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */ 4014 *cusp->cooPerm_a = d_i; /* copy the sorted array */ 4015 THRUSTINTARRAY w = d_j; 4016 4017 /* 4018 d_i = [1,1,3,3,4,4] 4019 d_j = [2,2,2,3,5,6] 4020 cooPerm = [2,4,1,0,3,5] 4021 */ 4022 auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */ 4023 4024 /* 4025 d_i = [1,3,3,4,4,x] 4026 ^ekey 4027 d_j = [2,2,3,5,6,x] 4028 ^nekye 4029 */ 4030 if (nekey == ekey) { /* all entries are unique */ 4031 delete cusp->cooPerm_a; 4032 cusp->cooPerm_a = NULL; 4033 } else { /* Stefano: I couldn't come up with a more elegant algorithm */ 4034 /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */ 4035 adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/ 4036 adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff()); /* w: [2,2,2,3,5,6] => [2,0,0,1,1,1]*/ 4037 (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */ 4038 w[0] = 0; 4039 thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a = [0,0,1,1,1,1]*/ 4040 thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/ 4041 } 4042 thrust::counting_iterator<PetscInt> search_begin(0); 4043 thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */ 4044 search_begin, search_begin + A->rmap->n, /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */ 4045 ii.begin()); /* ii = [0,1,1,3,5,5]. 
A leading 0 will be added later */ 4046 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4047 4048 ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr); 4049 a->singlemalloc = PETSC_FALSE; 4050 a->free_a = PETSC_TRUE; 4051 a->free_ij = PETSC_TRUE; 4052 ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr); 4053 a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */ 4054 cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4055 a->nz = a->maxnz = a->i[A->rmap->n]; 4056 a->rmax = 0; 4057 ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr); 4058 ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr); 4059 cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4060 if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); } 4061 if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); } 4062 for (PetscInt i = 0; i < A->rmap->n; i++) { 4063 const PetscInt nnzr = a->i[i+1] - a->i[i]; 4064 nzr += (PetscInt)!!(nnzr); 4065 a->ilen[i] = a->imax[i] = nnzr; 4066 a->rmax = PetscMax(a->rmax,nnzr); 4067 } 4068 a->nonzerorowcnt = nzr; 4069 A->preallocated = PETSC_TRUE; 4070 ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr); 4071 ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr); 4072 } else { 4073 ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr); 4074 } 4075 ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr); 4076 4077 /* We want to allocate the CUSPARSE struct for matvec now. 4078 The code is so convoluted now that I prefer to copy zeros */ 4079 ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr); 4080 ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr); 4081 A->offloadmask = PETSC_OFFLOAD_CPU; 4082 A->nonzerostate++; 4083 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4084 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 4085 4086 A->assembled = PETSC_FALSE; 4087 A->was_assembled = PETSC_FALSE; 4088 PetscFunctionReturn(0); 4089 } 4090 4091 /*@C 4092 MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices. 
4093 4094 Not collective 4095 4096 Input Parameters: 4097 + A - the matrix 4098 - compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form 4099 4100 Output Parameters: 4101 + ia - the CSR row pointers 4102 - ja - the CSR column indices 4103 4104 Level: developer 4105 4106 Notes: 4107 When compressed is true, the CSR structure does not contain empty rows 4108 4109 .seealso: MatSeqAIJCUSPARSERestoreIJ(), MatSeqAIJCUSPARSEGetArrayRead() 4110 @*/ 4111 PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j) 4112 { 4113 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4114 CsrMatrix *csr; 4115 PetscErrorCode ierr; 4116 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4117 4118 PetscFunctionBegin; 4119 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4120 if (!i || !j) PetscFunctionReturn(0); 4121 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4122 if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4123 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4124 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4125 csr = (CsrMatrix*)cusp->mat->mat; 4126 if (i) { 4127 if (!compressed && a->compressedrow.use) { /* need full row offset */ 4128 if (!cusp->rowoffsets_gpu) { 4129 cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 4130 cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 4131 ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 4132 } 4133 *i = cusp->rowoffsets_gpu->data().get(); 4134 } else *i = csr->row_offsets->data().get(); 4135 } 4136 if (j) *j = csr->column_indices->data().get(); 4137 PetscFunctionReturn(0); 4138 } 4139 4140 /*@C 4141 MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ() 4142 4143 Not collective 4144 4145 Input Parameters: 4146 + A - the matrix 4147 - compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form 4148 4149 Output Parameters: 4150 + ia - the CSR row pointers 4151 - ja - the CSR column indices 4152 4153 Level: developer 4154 4155 .seealso: MatSeqAIJCUSPARSEGetIJ() 4156 @*/ 4157 PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j) 4158 { 4159 PetscFunctionBegin; 4160 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4161 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4162 if (i) *i = NULL; 4163 if (j) *j = NULL; 4164 PetscFunctionReturn(0); 4165 } 4166 4167 /*@C 4168 MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 4169 4170 Not Collective 4171 4172 Input Parameter: 4173 . A - a MATSEQAIJCUSPARSE matrix 4174 4175 Output Parameter: 4176 . 
a - pointer to the device data 4177 4178 Level: developer 4179 4180 Notes: may trigger host-device copies if up-to-date matrix data is on host 4181 4182 .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArrayRead() 4183 @*/ 4184 PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a) 4185 { 4186 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4187 CsrMatrix *csr; 4188 PetscErrorCode ierr; 4189 4190 PetscFunctionBegin; 4191 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4192 PetscValidPointer(a,2); 4193 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4194 if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4195 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4196 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4197 csr = (CsrMatrix*)cusp->mat->mat; 4198 if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 4199 *a = csr->values->data().get(); 4200 PetscFunctionReturn(0); 4201 } 4202 4203 /*@C 4204 MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead() 4205 4206 Not Collective 4207 4208 Input Parameter: 4209 . A - a MATSEQAIJCUSPARSE matrix 4210 4211 Output Parameter: 4212 . a - pointer to the device data 4213 4214 Level: developer 4215 4216 .seealso: MatSeqAIJCUSPARSEGetArrayRead() 4217 @*/ 4218 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a) 4219 { 4220 PetscFunctionBegin; 4221 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4222 PetscValidPointer(a,2); 4223 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4224 *a = NULL; 4225 PetscFunctionReturn(0); 4226 } 4227 4228 /*@C 4229 MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 4230 4231 Not Collective 4232 4233 Input Parameter: 4234 . A - a MATSEQAIJCUSPARSE matrix 4235 4236 Output Parameter: 4237 . a - pointer to the device data 4238 4239 Level: developer 4240 4241 Notes: may trigger host-device copies if up-to-date matrix data is on host 4242 4243 .seealso: MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArray() 4244 @*/ 4245 PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a) 4246 { 4247 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4248 CsrMatrix *csr; 4249 PetscErrorCode ierr; 4250 4251 PetscFunctionBegin; 4252 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4253 PetscValidPointer(a,2); 4254 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4255 if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4256 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4257 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4258 csr = (CsrMatrix*)cusp->mat->mat; 4259 if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 4260 *a = csr->values->data().get(); 4261 A->offloadmask = PETSC_OFFLOAD_GPU; 4262 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); 4263 PetscFunctionReturn(0); 4264 } 4265 /*@C 4266 MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray() 4267 4268 Not Collective 4269 4270 Input Parameter: 4271 . A - a MATSEQAIJCUSPARSE matrix 4272 4273 Output Parameter: 4274 . 
a - pointer to the device data 4275 4276 Level: developer 4277 4278 .seealso: MatSeqAIJCUSPARSEGetArray() 4279 @*/ 4280 PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a) 4281 { 4282 PetscErrorCode ierr; 4283 4284 PetscFunctionBegin; 4285 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4286 PetscValidPointer(a,2); 4287 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4288 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 4289 *a = NULL; 4290 PetscFunctionReturn(0); 4291 } 4292 4293 /*@C 4294 MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 4295 4296 Not Collective 4297 4298 Input Parameter: 4299 . A - a MATSEQAIJCUSPARSE matrix 4300 4301 Output Parameter: 4302 . a - pointer to the device data 4303 4304 Level: developer 4305 4306 Notes: does not trigger host-device copies and flags data validity on the GPU 4307 4308 .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSERestoreArrayWrite() 4309 @*/ 4310 PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a) 4311 { 4312 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4313 CsrMatrix *csr; 4314 PetscErrorCode ierr; 4315 4316 PetscFunctionBegin; 4317 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4318 PetscValidPointer(a,2); 4319 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4320 if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4321 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4322 csr = (CsrMatrix*)cusp->mat->mat; 4323 if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 4324 *a = csr->values->data().get(); 4325 A->offloadmask = PETSC_OFFLOAD_GPU; 4326 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); 4327 PetscFunctionReturn(0); 4328 } 4329 4330 /*@C 4331 MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite() 4332 4333 Not Collective 4334 4335 Input Parameter: 4336 . A - a MATSEQAIJCUSPARSE matrix 4337 4338 Output Parameter: 4339 . a - pointer to the device data 4340 4341 Level: developer 4342 4343 .seealso: MatSeqAIJCUSPARSEGetArrayWrite() 4344 @*/ 4345 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a) 4346 { 4347 PetscErrorCode ierr; 4348 4349 PetscFunctionBegin; 4350 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4351 PetscValidPointer(a,2); 4352 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4353 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 4354 *a = NULL; 4355 PetscFunctionReturn(0); 4356 } 4357 4358 struct IJCompare4 4359 { 4360 __host__ __device__ 4361 inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) 4362 { 4363 if (t1.get<0>() < t2.get<0>()) return true; 4364 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 4365 return false; 4366 } 4367 }; 4368 4369 struct Shift 4370 { 4371 int _shift; 4372 4373 Shift(int shift) : _shift(shift) {} 4374 __host__ __device__ 4375 inline int operator() (const int &c) 4376 { 4377 return c + _shift; 4378 } 4379 }; 4380 4381 /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. 
[A';B']' operation in matlab notation */ 4382 PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 4383 { 4384 PetscErrorCode ierr; 4385 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c; 4386 Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp; 4387 Mat_SeqAIJCUSPARSEMultStruct *Cmat; 4388 CsrMatrix *Acsr,*Bcsr,*Ccsr; 4389 PetscInt Annz,Bnnz; 4390 cusparseStatus_t stat; 4391 PetscInt i,m,n,zero = 0; 4392 cudaError_t cerr; 4393 4394 PetscFunctionBegin; 4395 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4396 PetscValidHeaderSpecific(B,MAT_CLASSID,2); 4397 PetscValidPointer(C,4); 4398 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4399 PetscCheckTypeName(B,MATSEQAIJCUSPARSE); 4400 if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n); 4401 if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 4402 if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4403 if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4404 if (reuse == MAT_INITIAL_MATRIX) { 4405 m = A->rmap->n; 4406 n = A->cmap->n + B->cmap->n; 4407 ierr = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr); 4408 ierr = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr); 4409 ierr = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 4410 c = (Mat_SeqAIJ*)(*C)->data; 4411 Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 4412 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 4413 Ccsr = new CsrMatrix; 4414 Cmat->cprowIndices = NULL; 4415 c->compressedrow.use = PETSC_FALSE; 4416 c->compressedrow.nrows = 0; 4417 c->compressedrow.i = NULL; 4418 c->compressedrow.rindex = NULL; 4419 Ccusp->workVector = NULL; 4420 Ccusp->nrows = m; 4421 Ccusp->mat = Cmat; 4422 Ccusp->mat->mat = Ccsr; 4423 Ccsr->num_rows = m; 4424 Ccsr->num_cols = n; 4425 stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat); 4426 stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4427 stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 4428 cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 4429 cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 4430 cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 4431 cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4432 cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4433 cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4434 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4435 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 4436 if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4437 if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4438 4439 Acsr = (CsrMatrix*)Acusp->mat->mat; 4440 Bcsr = (CsrMatrix*)Bcusp->mat->mat; 4441 Annz = (PetscInt)Acsr->column_indices->size(); 4442 Bnnz = (PetscInt)Bcsr->column_indices->size(); 4443 c->nz = Annz + Bnnz; 4444 Ccsr->row_offsets = new 
THRUSTINTARRAY32(m+1); 4445 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 4446 Ccsr->values = new THRUSTARRAY(c->nz); 4447 Ccsr->num_entries = c->nz; 4448 Ccusp->cooPerm = new THRUSTINTARRAY(c->nz); 4449 if (c->nz) { 4450 auto Acoo = new THRUSTINTARRAY32(Annz); 4451 auto Bcoo = new THRUSTINTARRAY32(Bnnz); 4452 auto Ccoo = new THRUSTINTARRAY32(c->nz); 4453 THRUSTINTARRAY32 *Aroff,*Broff; 4454 4455 if (a->compressedrow.use) { /* need full row offset */ 4456 if (!Acusp->rowoffsets_gpu) { 4457 Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 4458 Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 4459 ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 4460 } 4461 Aroff = Acusp->rowoffsets_gpu; 4462 } else Aroff = Acsr->row_offsets; 4463 if (b->compressedrow.use) { /* need full row offset */ 4464 if (!Bcusp->rowoffsets_gpu) { 4465 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 4466 Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 4467 ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 4468 } 4469 Broff = Bcusp->rowoffsets_gpu; 4470 } else Broff = Bcsr->row_offsets; 4471 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4472 stat = cusparseXcsr2coo(Acusp->handle, 4473 Aroff->data().get(), 4474 Annz, 4475 m, 4476 Acoo->data().get(), 4477 CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4478 stat = cusparseXcsr2coo(Bcusp->handle, 4479 Broff->data().get(), 4480 Bnnz, 4481 m, 4482 Bcoo->data().get(), 4483 CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4484 /* Issues when using bool with large matrices on SUMMIT 10.2.89 */ 4485 auto Aperm = thrust::make_constant_iterator(1); 4486 auto Bperm = thrust::make_constant_iterator(0); 4487 #if PETSC_PKG_CUDA_VERSION_GE(10,0,0) 4488 auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n)); 4489 auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n)); 4490 #else 4491 /* there are issues instantiating the merge operation using a transform iterator for the columns of B */ 4492 auto Bcib = Bcsr->column_indices->begin(); 4493 auto Bcie = Bcsr->column_indices->end(); 4494 thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n)); 4495 #endif 4496 auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz); 4497 auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm)); 4498 auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm)); 4499 auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm)); 4500 auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm)); 4501 auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin())); 4502 auto p1 = Ccusp->cooPerm->begin(); 4503 auto p2 = Ccusp->cooPerm->begin(); 4504 thrust::advance(p2,Annz); 4505 PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4())); 4506 #if PETSC_PKG_CUDA_VERSION_LT(10,0,0) 4507 thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n)); 4508 #endif 4509 auto cci = thrust::make_counting_iterator(zero); 4510 auto cce = thrust::make_counting_iterator(c->nz); 4511 #if 0 //Errors on SUMMIT cuda 11.1.0 4512 PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>())); 4513 #else 4514 auto pred 
= thrust::identity<int>(); 4515 PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred)); 4516 PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred)); 4517 #endif 4518 stat = cusparseXcoo2csr(Ccusp->handle, 4519 Ccoo->data().get(), 4520 c->nz, 4521 m, 4522 Ccsr->row_offsets->data().get(), 4523 CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4524 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4525 delete wPerm; 4526 delete Acoo; 4527 delete Bcoo; 4528 delete Ccoo; 4529 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4530 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, 4531 Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), 4532 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 4533 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 4534 #endif 4535 if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */ 4536 ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr); 4537 ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr); 4538 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4539 Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct; 4540 CsrMatrix *CcsrT = new CsrMatrix; 4541 CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4542 CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4543 4544 (*C)->form_explicit_transpose = PETSC_TRUE; 4545 (*C)->transupdated = PETSC_TRUE; 4546 Ccusp->rowoffsets_gpu = NULL; 4547 CmatT->cprowIndices = NULL; 4548 CmatT->mat = CcsrT; 4549 CcsrT->num_rows = n; 4550 CcsrT->num_cols = m; 4551 CcsrT->num_entries = c->nz; 4552 4553 CcsrT->row_offsets = new THRUSTINTARRAY32(n+1); 4554 CcsrT->column_indices = new THRUSTINTARRAY32(c->nz); 4555 CcsrT->values = new THRUSTARRAY(c->nz); 4556 4557 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4558 auto rT = CcsrT->row_offsets->begin(); 4559 if (AT) { 4560 rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT); 4561 thrust::advance(rT,-1); 4562 } 4563 if (BT) { 4564 auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz)); 4565 auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz)); 4566 thrust::copy(titb,tite,rT); 4567 } 4568 auto cT = CcsrT->column_indices->begin(); 4569 if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT); 4570 if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT); 4571 auto vT = CcsrT->values->begin(); 4572 if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4573 if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 4574 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4575 4576 stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat); 4577 stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4578 stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 4579 cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 4580 cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 4581 cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 4582 cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, 
sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4583 cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4584 cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4585 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4586 stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, 4587 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), 4588 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 4589 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 4590 #endif 4591 Ccusp->matTranspose = CmatT; 4592 } 4593 } 4594 4595 c->singlemalloc = PETSC_FALSE; 4596 c->free_a = PETSC_TRUE; 4597 c->free_ij = PETSC_TRUE; 4598 ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 4599 ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 4600 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 4601 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 4602 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 4603 ii = *Ccsr->row_offsets; 4604 jj = *Ccsr->column_indices; 4605 cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4606 cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4607 } else { 4608 cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4609 cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4610 } 4611 ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 4612 ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 4613 ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 4614 c->maxnz = c->nz; 4615 c->nonzerorowcnt = 0; 4616 c->rmax = 0; 4617 for (i = 0; i < m; i++) { 4618 const PetscInt nn = c->i[i+1] - c->i[i]; 4619 c->ilen[i] = c->imax[i] = nn; 4620 c->nonzerorowcnt += (PetscInt)!!nn; 4621 c->rmax = PetscMax(c->rmax,nn); 4622 } 4623 ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr); 4624 ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 4625 (*C)->nonzerostate++; 4626 ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr); 4627 ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr); 4628 Ccusp->nonzerostate = (*C)->nonzerostate; 4629 (*C)->preallocated = PETSC_TRUE; 4630 } else { 4631 if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,(*C)->rmap->n,B->rmap->n); 4632 c = (Mat_SeqAIJ*)(*C)->data; 4633 if (c->nz) { 4634 Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 4635 if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm"); 4636 if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4637 if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate"); 4638 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4639 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 4640 if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4641 if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing 
Mat_SeqAIJCUSPARSEMultStruct"); 4642 Acsr = (CsrMatrix*)Acusp->mat->mat; 4643 Bcsr = (CsrMatrix*)Bcusp->mat->mat; 4644 Ccsr = (CsrMatrix*)Ccusp->mat->mat; 4645 if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %" PetscInt_FMT " != %" PetscInt_FMT,Acsr->num_entries,(PetscInt)Acsr->values->size()); 4646 if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %" PetscInt_FMT " != %" PetscInt_FMT,Bcsr->num_entries,(PetscInt)Bcsr->values->size()); 4647 if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT,Ccsr->num_entries,(PetscInt)Ccsr->values->size()); 4648 if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT,Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries); 4649 if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %" PetscInt_FMT " != %" PetscInt_FMT,(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size()); 4650 auto pmid = Ccusp->cooPerm->begin(); 4651 thrust::advance(pmid,Acsr->num_entries); 4652 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4653 auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), 4654 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin()))); 4655 auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), 4656 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4657 thrust::for_each(zibait,zieait,VecCUDAEquals()); 4658 auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), 4659 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4660 auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), 4661 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end()))); 4662 thrust::for_each(zibbit,ziebit,VecCUDAEquals()); 4663 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr); 4664 if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) { 4665 if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct"); 4666 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4667 CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4668 CsrMatrix *BcsrT = BT ? 
(CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4669 CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
4670 auto vT = CcsrT->values->begin();
4671 if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4672 if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4673 (*C)->transupdated = PETSC_TRUE;
4674 }
4675 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4676 }
4677 }
4678 ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
4679 (*C)->assembled = PETSC_TRUE;
4680 (*C)->was_assembled = PETSC_FALSE;
4681 (*C)->offloadmask = PETSC_OFFLOAD_GPU;
4682 PetscFunctionReturn(0);
4683 }
4684
4685 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
4686 {
4687 PetscErrorCode ierr;
4688 bool dmem;
4689 const PetscScalar *av;
4690 cudaError_t cerr;
4691
4692 PetscFunctionBegin;
4693 dmem = isCudaMem(v);
4694 ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
4695 if (n && idx) {
4696 THRUSTINTARRAY widx(n);
4697 widx.assign(idx,idx+n);
4698 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
4699
4700 THRUSTARRAY *w = NULL;
4701 thrust::device_ptr<PetscScalar> dv;
4702 if (dmem) {
4703 dv = thrust::device_pointer_cast(v);
4704 } else {
4705 w = new THRUSTARRAY(n);
4706 dv = w->data();
4707 }
4708 thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);
4709
4710 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
4711 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
4712 thrust::for_each(zibit,zieit,VecCUDAEquals());
4713 if (w) {
4714 cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4715 }
4716 delete w;
4717 } else {
4718 cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4719 }
4720 if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); } /* the selected values were copied from the GPU to the host array v */
4721 ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
4722 PetscFunctionReturn(0);
4723 }
4724
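/*
   Usage sketch (illustrative only, not part of the library): the COO assembly path implemented
   above by MatSetPreallocationCOO_SeqAIJCUSPARSE() and MatSetValuesCOO_SeqAIJCUSPARSE() is normally
   reached through the generic MatSetPreallocationCOO() and MatSetValuesCOO() interface once the
   matrix type is MATSEQAIJCUSPARSE, and the resulting device CSR values can be inspected with
   MatSeqAIJCUSPARSEGetArrayRead()/MatSeqAIJCUSPARSERestoreArrayRead() defined earlier in this file.
   The preallocation call builds the CSR pattern (and cooPerm) once; MatSetValuesCOO() may then be
   called repeatedly with new values, and repeated (i,j) pairs are summed. The sizes, indices, and
   values below are made-up sample data, and error checking is abbreviated to the usual CHKERRQ form.

     Mat                A;
     PetscErrorCode     ierr;
     PetscInt           coo_i[5] = {0,0,1,2,2};
     PetscInt           coo_j[5] = {0,0,1,1,2};
     PetscScalar        v[5]     = {1.0,1.0,2.0,3.0,4.0};
     const PetscScalar *av;

     ierr = MatCreate(PETSC_COMM_SELF,&A);CHKERRQ(ierr);
     ierr = MatSetSizes(A,3,3,3,3);CHKERRQ(ierr);
     ierr = MatSetType(A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
     ierr = MatSetPreallocationCOO(A,5,coo_i,coo_j);CHKERRQ(ierr);
     ierr = MatSetValuesCOO(A,v,INSERT_VALUES);CHKERRQ(ierr);
     ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
     ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
     ierr = MatDestroy(&A);CHKERRQ(ierr);
*/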