/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <thrust/async/for_each.h>

/* String table for the MatCUSPARSEStorageFormat enum, in the format PetscOptionsEnum() expects:
   the value names, then the enum type name, then the common prefix, then a 0 sentinel */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)        = 1,
      CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)        = 2,
      CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)        = 3,
      CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)        = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
  } cusparseCsr2CscAlg_t;
*/
/* String tables mirroring the cusparse enums above, ordered by 0-based integer value for PetscOptionsEnum() */
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif

/* Forward declarations of the factorization, triangular-solve, option, and mat-vec
   implementations defined later in this file */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static
PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

/* Destroy helpers for the GPU-side data structures hanging off A->spptr */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);

PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);

/* Record a user-supplied CUDA stream on the matrix and make the matrix's cuSPARSE
   handle issue its work on that stream. Errors if the GPU data (A->spptr) is missing. */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

/* Install a user-supplied cuSPARSE handle on the matrix. If the matrix already owns a
   different handle, that handle is destroyed first; the (possibly new) handle is then
   switched to device pointer mode. */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      /* release the handle the matrix currently owns before adopting the new one */
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

/* Forget (without destroying) the cuSPARSE handle stored on the matrix; a no-op when the
   matrix is not MATSEQAIJCUSPARSE or has no GPU data. Presumably used when the handle is
   owned elsewhere — the handle is only zeroed here, never destroyed. */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          flg;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg || !cusparsestruct) PetscFunctionReturn(0);
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

/* Query callback composed on factor matrices: reports the solver package name */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/* Create the factor matrix B for the "cusparse" solver package: a square MATSEQAIJCUSPARSE
   matrix of A's local size, with the symbolic-factorization function pointers selected per
   factor type (LU/ILU/ILUDT vs Cholesky/ICC) and per CPU binding of A, plus the preferred
   orderings for each factor type. Unsupported factor types raise PETSC_ERR_SUP. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  /* propagate CPU binding from A when requested */
  if (A->boundtocpu && A->bindingpropagates) { ierr = MatBindToCPU(*B,PETSC_TRUE);CHKERRQ(ierr); }
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      /* A is pinned to the CPU: fall back to the plain SeqAIJ symbolic routines */
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr);
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr);
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  (*B)->canuseordering = PETSC_TRUE;
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type-specific implementation behind MatCUSPARSESetFormat(). For sequential matrices both
   MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL currently set the same single format field. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.
   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB.
   The latter two require CUDA 4.2)

   Output Parameter:

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation composed on A (no-op if absent) */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* MatSetOption implementation: intercepts MAT_FORM_EXPLICIT_TRANSPOSE so the cached GPU
   transpose can be invalidated when the option is turned off; all other options are
   delegated to the SeqAIJ implementation. */
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_FORM_EXPLICIT_TRANSPOSE:
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);}
    A->form_explicit_transpose = flg;
    break;
  default:
    ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
    break;
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);

/* Numeric LU factorization: pulls A back to the host if needed, runs the CPU SeqAIJ
   numeric factorization, selects the natural-ordering solve kernels when both row and
   column permutations are identities, and finally pushes the triangular factors to the GPU. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Options processing for SEQAIJCUSPARSE: storage format(s) for non-factored matrices and,
   for CUDA >= 11, the cuSPARSE SpMV/SpMM/csr2csc algorithm choices. Each enum option is
   cross-checked against the cuSPARSE header values, since PetscOptionsEnum() assigns values
   by position in the corresponding string table. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
    if (flg && CUSPARSE_SPMV_CSR_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Symbolic ILU: resets any stale GPU triangular-factor data, delegates the symbolic phase
   to the CPU SeqAIJ routine, and points the numeric phase at the CUSPARSE version. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
PetscErrorCode                 ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic LU: same pattern — reset GPU factor data, CPU symbolic phase, GPU numeric phase */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic ICC: reset GPU factor data, CPU symbolic phase, GPU Cholesky numeric phase */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic Cholesky: reset GPU factor data, CPU symbolic phase, GPU numeric phase */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Build (first call) or refresh (subsequent calls) the GPU copy of the unit-diagonal lower
   triangular factor L of an ILU factorization: assembles a CSR image in pinned host memory
   (with explicit 1.0 diagonal entries, matching CUSPARSE_DIAG_TYPE_UNIT), uploads it to a
   CsrMatrix, and runs the cuSPARSE triangular-solve analysis. The host value array AA_h is
   kept so later calls only need to refresh values. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* only rebuild when the up-to-date data lives on the CPU (or nothing is on the GPU yet) */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        /* first time through: build the full CSR structure in pinned host buffers */
        PetscScalar *AALo;

        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix: row 0 holds just the unit diagonal */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          /* copy the strictly-lower entries of row i, then append the unit diagonal */
          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* newer API requires an explicit user-provided work buffer for the analysis/solve */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 loTriFactor->solveInfo,
                                 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                                 loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer; keep AALo cached as AA_h for later value-only updates */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h = AALo;
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v      = aa;
        vi     = aj;
        offset = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v  += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Build (first call) or refresh (subsequent calls) the GPU copy of the upper triangular
   factor U of an ILU factorization. Rows are assembled back-to-front from a->diag; the
   diagonal entry is stored as its reciprocal (1./v[nz]) with CUSPARSE_DIAG_TYPE_NON_UNIT.
   Structure is uploaded once; the pinned host value array AA_h is cached for value-only
   refreshes, and the cuSPARSE triangular-solve analysis is run on first build. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix, walking rows from last to first */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;
          vi = aj + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements (stored as reciprocal) */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1./v[nz];
          AiUp[i]      = AiUp[i+1] - (nz+1);

          ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = n;
        upTriFactor->csrMat->num_cols = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* newer API requires an explicit user-provided work buffer for the analysis/solve */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 upTriFactor->solveInfo,
                                 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                                 upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer; keep AAUp cached as AA_h for later value-only updates */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
        upTriFactor->AA_h = AAUp;
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      } else {
        /* structure already on the GPU: refresh the numerical values only */
        if (!upTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v = aa + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1./v[nz];
          ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v,
nz);CHKERRQ(ierr);
      }
      /* upload the host-side staging buffer of the upper factor to the GPU */
      upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
      ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    }
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  }
  PetscFunctionReturn(0);
}

/* Builds both ILU triangular factors on the GPU and caches the row/column
   permutations (as device arrays) needed to apply the factored solve when the
   orderings are not the identity. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* scratch vector reused by the two-stage triangular solves */
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cusparseTriFactors->nnz=a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
    ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* upper triangular indices */
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
    ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Builds the two triangular factors of an ICC(0) factorization on the GPU.
   Only the upper factor's pattern (AiUp/AjUp) is stored; the lower solve reuses
   that pattern with CUSPARSE_OPERATION_TRANSPOSE (set further below).
   NOTE(review): A->data is cast both to Mat_SeqAIJ and Mat_SeqSBAIJ here — the
   factor matrix is stored in SBAIJ(1) layout with the diagonal last in each row. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the factor values */
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz]; /* diagonal is stored last in each SBAIJ row */
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];
offset+=1;
          if (nz>0) {
            /* copy the off-diagonal column indices/values; values are negated
               (and, for the lower factor, scaled by the inverse diagonal) */
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 upTriFactor->solveInfo,
                                 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                                 upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        /* the lower factor reuses the UPPER pattern and is solved with
           CUSPARSE_OPERATION_TRANSPOSE below, so the fill mode stays UPPER */
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 loTriFactor->solveInfo,
                                 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                                 loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else {
        /* factors already exist on the GPU: only refresh the numerical values */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Builds the ICC triangular factors on the GPU and caches the row permutation
   and its inverse (as device arrays) for the non-identity-ordering solve path. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  /* nnz of L + U with the shared diagonal counted once */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip,*rip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
    /* rperm = ordering, cperm = its inverse (symmetric factorization) */
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Numeric Cholesky: factor on the CPU, then pick the MatSolve variant based on
   whether the ordering is the identity and push the factors to the GPU. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Builds explicit transposes (CSC) of both triangular factors and runs the
   cusparse solve analysis on them, for use by MatSolveTranspose. */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor;
     the fill mode flips because the transpose swaps upper<->lower */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode =
cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
             CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                       loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                       loTriFactor->csrMat->values->data().get(),
                                       loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(),
                                       loTriFactorT->csrMat->values->data().get(),
                                       loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
#else
                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
#endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* bug fix: this used to call PetscLogEventBegin a second time, leaving the
     MAT_CUSPARSEGenerateTranspose event unbalanced in the PETSc logs */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                           loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                           loTriFactorT->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           loTriFactorT->solveInfo,
                           loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                           loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
               CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                       upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                       upTriFactor->csrMat->values->data().get(),
                                       upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(),
                                       upTriFactorT->csrMat->values->data().get(),
                                       upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
#else
                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
#endif

  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* bug fix: this used to call PetscLogEventBegin a second time, leaving the
     MAT_CUSPARSEGenerateTranspose event unbalanced in the PETSc logs */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                           upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                           upTriFactorT->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           upTriFactorT->solveInfo,
                           upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                           upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#endif

  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}

/* Unary functor: recover an integer permutation index that was stored in the
   real part of a PetscScalar (used below to cache the csr2csc permutation). */
struct PetscScalarToPetscInt
{
  __host__ __device__
  PetscInt operator()(PetscScalar s)
  {
    return (PetscInt)PetscRealPart(s);
  }
};

/* Forms (or refreshes) the explicit transpose of the matrix on the GPU,
   caching the csr2csc index permutation so later updates are a gather. */
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(0); /* cached transpose is current */
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
  }
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

    /* set alpha and beta */
    err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      CsrMatrix *matrixT = new CsrMatrix;
      matstructT->mat = matrixT;
      /* transpose dims: rows/cols swapped relative to A */
      matrixT->num_rows = A->cmap->n;
      matrixT->num_cols = A->rmap->n;
      matrixT->num_entries = a->nz;
      matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values = new THRUSTARRAY(a->nz);

      if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
#if PETSC_PKG_CUDA_VERSION_GE(11,2,1)
      stat = cusparseCreateCsr(&matstructT->matDescr,
                               matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                               matrixT->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
#else
      /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
         see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

         I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
         it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
         when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
      */
      if (matrixT->num_entries) {
        stat = cusparseCreateCsr(&matstructT->matDescr,
                                 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                                 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                                 matrixT->values->data().get(),
                                 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I,
                                 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);

      } else {
        matstructT->matDescr = NULL;
        matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
      }
#endif
#endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows = A->rmap->n;
      temp->num_cols = A->cmap->n;
      temp->num_entries = a->nz;
      temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle,
                              matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get());CHKERRCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      tempT->num_rows = A->rmap->n;
      tempT->num_cols = A->cmap->n;
      tempT->num_entries = a->nz;
      tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                              temp->num_cols, temp->num_entries,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get(),
                              tempT->values->data().get(),
                              tempT->column_indices->data().get(),
                              tempT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
      cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
                                         CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                              matstructT->descr, tempT->values->data().get(),
                              tempT->row_offsets->data().get(),
                              tempT->column_indices->data().get(),
                              hybMat, 0, partition);CHKERRCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY*) tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
        delete (CsrMatrix*) tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY*) temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
        delete (CsrMatrix*) temp;
      }
#endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /*
transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
    if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
    if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
    if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
    if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
    if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
    if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
    if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
    if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
      ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    if (!cusparsestruct->csr2csc_i) {
      /* run csr2csc once on the sequence 0,1,2,... so that matrixT->values
         receives the CSR->CSC permutation, which is then cached as csr2csc_i */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      void   *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                           A->cmap->n, matrix->num_entries,
                                           matrix->values->data().get(),
                                           cusparsestruct->rowoffsets_gpu->data().get(),
                                           matrix->column_indices->data().get(),
                                           matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                           CUSPARSE_ACTION_NUMERIC,indexBase,
                                           cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
      err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err);
#endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                                A->cmap->n,matrix->num_entries,
                                csr2csc_a.data().get(),
                                cusparsestruct->rowoffsets_gpu->data().get(),
                                matrix->column_indices->data().get(),
                                matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                CUSPARSE_ACTION_NUMERIC,indexBase,
                                cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat);
#else
                                matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                                CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
#endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
      }

      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      err = cudaFree(csr2cscBuffer);CHKERRCUDA(err);
#endif
    }
    /* update the transpose values by gathering through the cached permutation */
    PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(),
                                                                              cusparsestruct->csr2csc_i->begin()),
                                       thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
                                       matrixT->values->begin()));
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/* Triangular solve with the TRANSPOSED factors: applies the row permutation to b,
   solves with the transposed upper factor into the work vector, then with the
   transposed lower factor back into x, and finally applies the column permutation.
   The transposed factor structures are built lazily on first use. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U */
  /* rhs = xarray (the permuted b), solution written to the work vector tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
#endif

  /* Then, solve L */
  /* rhs = tempGPU, solution written back into xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        xarray);CHKERRCUSPARSE(stat);
#endif

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Same as MatSolveTranspose_SeqAIJCUSPARSE but for a natural (identity) ordering:
   no row/column permutation copies are needed, so the two triangular solves read
   b directly and write x directly, using tempGPU as the intermediate. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
#endif

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        xarray);CHKERRCUSPARSE(stat);
#endif

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Forward triangular solve A x = b with the (non-transposed) factors:
   permute b into the work vector, solve L then U, then apply the column
   permutation into x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        xarray);CHKERRCUSPARSE(stat);
#endif

  /* Then, solve U */
  /* rhs = xarray (the L-solve result), solution written to the work vector */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,xarray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
#endif

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Forward solve for a natural (identity) ordering: no permutation copies;
   solve L into the work vector, then U into x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        loTriFactor->solvePolicy,loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
#endif

  /* Next, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        xarray);CHKERRCUSPARSE(stat);
#endif

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Copies the CSR numerical values (not the sparsity pattern) from the GPU back
   to the host array a->a when the GPU copy is the freshest; afterwards both
   copies are valid (PETSC_OFFLOAD_BOTH).
   NOTE(review): assumes cusp->mat holds a CsrMatrix, i.e. MAT_CUSPARSE_CSR
   format — confirm callers guarantee this. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cudaError_t        cerr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat;

    ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

/* Returns the host values array; since the caller may modify it, the offload
   mask is downgraded to CPU so the device copy is refreshed on next use. */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  *array = a->a;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}

/* Mirrors the host AIJ matrix onto the GPU. If the nonzero pattern is unchanged
   and the format is CSR, only the values are re-uploaded; otherwise the whole
   device structure (descriptors, index arrays, values, compressed-row info) is
   rebuilt from scratch. */
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu)
SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      matrix->values->assign(a->a, a->a+a->nz);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* values changed, so any cached transpose values are stale (pattern kept) */
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
    } else {
      PetscInt nnz;
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* nonzero pattern (or format) changed: tear down and rebuild everything */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* compressed-row storage skips empty rows: use its row count/index arrays */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
        if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");
        /* no host values yet: pattern-only upload, offload mask stays off BOTH */
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
        else nnz = a->nz;

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident scalars 1 and 0, needed because pointer mode is DEVICE below */
        err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows       = m;
          mat->num_cols       = A->cmap->n;
          mat->num_entries    = nnz;
          mat->row_offsets    = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                     mat->num_rows, mat->num_cols, mat->num_entries,
                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                     mat->values->data().get(),
                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* build a temporary CSR on device, convert to HYB, then free the CSR */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows       = m;
          mat->num_cols       = A->cmap->n;
          mat->num_entries    = nnz;
          mat->row_offsets    = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
                                  matstruct->descr, mat->values->data().get(),
                                  mat->row_offsets->data().get(),
                                  mat->column_indices->data().get(),
                                  hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

/* Thrust functor: accumulate the first tuple element into the second (y += x) */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/* Thrust functor: copy the first tuple element into the second (y = x) */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/* Thrust functor: copy the second tuple element into the first (x = y) */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};

/* Per-product scratch state for sparse-times-dense (and sparse-times-sparse)
   products; owned by Mat_Product->data and released by MatDestroy_MatMatCusparse. */
struct MatMatCusparse {
  PetscBool      cisdense;   /* was C originally a (host) SEQDENSE matrix? */
  PetscScalar    *Bt;        /* device buffer for an explicitly transposed B (pre-CUDA-11 path) */
  Mat            X;          /* intermediate dense product for PtAP/RARt */
  PetscBool      reusesym;   /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;
  CsrMatrix      *Bcsr;

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t matSpBDescr;
  PetscBool            initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;
  cusparseDnMatDescr_t matCDescr;
  PetscInt             Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  void *dBuffer4;
  void *dBuffer5;
#endif
  size_t               mmBufferSize;
  void                 *mmBuffer;
  void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};

/* Destructor for MatMatCusparse: frees every device buffer and cusparse
   descriptor that was allocated, then the struct itself. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  PetscErrorCode   ierr;
  MatMatCusparse   *mmdata = (MatMatCusparse *)data;
  cudaError_t      cerr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
#endif

  PetscFunctionBegin;
  cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
  delete mmdata->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->matBDescr)   { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->matCDescr)   { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->spgemmDesc)  { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); }
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  if (mmdata->dBuffer4)    { cerr = cudaFree(mmdata->dBuffer4);CHKERRCUDA(cerr); }
2010 if (mmdata->dBuffer5) { cerr = cudaFree(mmdata->dBuffer5);CHKERRCUDA(cerr); } 2011 #endif 2012 if (mmdata->mmBuffer) { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); } 2013 if (mmdata->mmBuffer2) { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); } 2014 #endif 2015 ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr); 2016 ierr = PetscFree(data);CHKERRQ(ierr); 2017 PetscFunctionReturn(0); 2018 } 2019 2020 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool); 2021 2022 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2023 { 2024 Mat_Product *product = C->product; 2025 Mat A,B; 2026 PetscInt m,n,blda,clda; 2027 PetscBool flg,biscuda; 2028 Mat_SeqAIJCUSPARSE *cusp; 2029 cusparseStatus_t stat; 2030 cusparseOperation_t opA; 2031 const PetscScalar *barray; 2032 PetscScalar *carray; 2033 PetscErrorCode ierr; 2034 MatMatCusparse *mmdata; 2035 Mat_SeqAIJCUSPARSEMultStruct *mat; 2036 CsrMatrix *csrmat; 2037 2038 PetscFunctionBegin; 2039 MatCheckProduct(C,1); 2040 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty"); 2041 mmdata = (MatMatCusparse*)product->data; 2042 A = product->A; 2043 B = product->B; 2044 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2045 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2046 /* currently CopyToGpu does not copy if the matrix is bound to CPU 2047 Instead of silently accepting the wrong answer, I prefer to raise the error */ 2048 if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2049 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2050 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2051 switch (product->type) { 2052 case MATPRODUCT_AB: 2053 case MATPRODUCT_PtAP: 2054 mat = cusp->mat; 2055 
opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2056 m = A->rmap->n; 2057 n = B->cmap->n; 2058 break; 2059 case MATPRODUCT_AtB: 2060 if (!A->form_explicit_transpose) { 2061 mat = cusp->mat; 2062 opA = CUSPARSE_OPERATION_TRANSPOSE; 2063 } else { 2064 ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr); 2065 mat = cusp->matTranspose; 2066 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2067 } 2068 m = A->cmap->n; 2069 n = B->cmap->n; 2070 break; 2071 case MATPRODUCT_ABt: 2072 case MATPRODUCT_RARt: 2073 mat = cusp->mat; 2074 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2075 m = A->rmap->n; 2076 n = B->rmap->n; 2077 break; 2078 default: 2079 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2080 } 2081 if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 2082 csrmat = (CsrMatrix*)mat->mat; 2083 /* if the user passed a CPU matrix, copy the data to the GPU */ 2084 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr); 2085 if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);} 2086 ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr); 2087 2088 ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr); 2089 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2090 ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2091 ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr); 2092 } else { 2093 ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr); 2094 ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr); 2095 } 2096 2097 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2098 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2099 cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? 
CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 2100 /* (re)allocate mmBuffer if not initialized or LDAs are different */ 2101 if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 2102 size_t mmBufferSize; 2103 if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;} 2104 if (!mmdata->matBDescr) { 2105 stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2106 mmdata->Blda = blda; 2107 } 2108 2109 if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;} 2110 if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 2111 stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2112 mmdata->Clda = clda; 2113 } 2114 2115 if (!mat->matDescr) { 2116 stat = cusparseCreateCsr(&mat->matDescr, 2117 csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, 2118 csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), 2119 csrmat->values->data().get(), 2120 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2121 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 2122 } 2123 stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one, 2124 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2125 mmdata->matCDescr,cusparse_scalartype, 2126 cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat); 2127 if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) { 2128 cudaError_t cerr; 2129 cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); 2130 cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr); 2131 mmdata->mmBufferSize = mmBufferSize; 2132 } 2133 mmdata->initialized = PETSC_TRUE; 
2134 } else { 2135 /* to be safe, always update pointers of the mats */ 2136 stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat); 2137 stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat); 2138 stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat); 2139 } 2140 2141 /* do cusparseSpMM, which supports transpose on B */ 2142 stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one, 2143 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2144 mmdata->matCDescr,cusparse_scalartype, 2145 cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2146 #else 2147 PetscInt k; 2148 /* cusparseXcsrmm does not support transpose on B */ 2149 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2150 cublasHandle_t cublasv2handle; 2151 cublasStatus_t cerr; 2152 2153 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 2154 cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T, 2155 B->cmap->n,B->rmap->n, 2156 &PETSC_CUSPARSE_ONE ,barray,blda, 2157 &PETSC_CUSPARSE_ZERO,barray,blda, 2158 mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr); 2159 blda = B->cmap->n; 2160 k = B->cmap->n; 2161 } else { 2162 k = B->rmap->n; 2163 } 2164 2165 /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 2166 stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k, 2167 csrmat->num_entries,mat->alpha_one,mat->descr, 2168 csrmat->values->data().get(), 2169 csrmat->row_offsets->data().get(), 2170 csrmat->column_indices->data().get(), 2171 mmdata->Bt ? 
mmdata->Bt : barray,blda,mat->beta_zero, /* use the explicitly transposed copy of B when one was built (pre-CUDA-11 ABt/RARt path) */
                           carray,clda);CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  /* For the composed products the sparse kernel above wrote its result into mmdata->X
     (sized A->rmap->n x B->rmap->n for RARt, A->rmap->n x B->cmap->n for PtAP by the
     symbolic phase below); finish with a dense-dense multiply by B (transposed for PtAP). */
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* give back to the caller the CPU (MATSEQDENSE) types it originally had */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Symbolic phase for C = op(A)*op(B) with A of type SeqAIJCUSPARSE and B dense (CUDA):
   validates the operands, sets the sizes and (CUDA) type of C, and allocates the
   MatMatCusparse product data, including the B^T scratch buffer (pre-CUDA-11 only)
   and the dense intermediate mmdata->X needed by RARt/PtAP. Installs the numeric
   callback MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  /* result dimensions depend on the product type */
  switch (product->type) {
  case MATPRODUCT_AB:   /* C = A*B   -> m_A x n_B */
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:  /* C = A^T*B -> n_A x n_B */
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:  /* C = A*B^T -> m_A x m_B */
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP: /* C = B^T*A*B -> n_B x n_B (B plays the role of P) */
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt: /* C = B*A*B^T -> m_B x m_B (B plays the role of R) */
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense; /* remember so the numeric phase can convert C back to MATSEQDENSE */
#if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}

/* Numeric phase for sparse*sparse products (C = A*B, A^T*B or A*B^T, all SeqAIJCUSPARSE):
   reuses the cuSPARSE SpGEMM descriptors/buffers built by the symbolic phase and
   recomputes only the values of C on the GPU. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    /* values already on the GPU: only sanity-check the C structures, then jump to the assembly bookkeeping */
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    if (!Ccsr)
SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize; /* empty product: nothing to compute, just redo the assembly bookkeeping */
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  /* make sure the latest host values of A and B are on the device before multiplying */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);

  /* symmetry shortcuts: A^T*B with symmetric A (resp. A*B^T with symmetric B) was set up
     by the symbolic phase as a plain A*B; replay the same substitution here and verify it */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    if (!product->symbolic_used_the_fact_A_is_symmetric) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    if (!product->symbolic_used_the_fact_B_is_symmetric) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric");
  }
  /* pick the cached mult structs; transposes use the explicitly formed transpose structs
     since opA/opB above are always NON_TRANSPOSE */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  /* CUDA >= 11.4: SpGEMMreuse keeps the structure from the symbolic phase, only values are recomputed */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                                     Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                     cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                     mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  /* CUDA 11.0-11.3: recompute with the buffers sized in the symbolic phase, then copy values into C */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#endif
#else
  /* pre-CUDA-11: legacy csrgemm recomputes values into the structure allocated by the symbolic phase */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
  ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}

/* Symbolic phase for sparse*sparse products: builds the cuSPARSE SpGEMM descriptors,
   the structure (row offsets / column indices) of C, and the reusable work buffers.
   NOTE(review): only the head of this routine is visible in this chunk; the body
   continues past the end of this view. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed,ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t         BmatSpDescr;
#else
  int                          cnz;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  C->product->data = mmdata;
C->product->destroy = MatDestroy_MatMatCusparse; 2439 2440 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2441 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 2442 Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */ 2443 Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2444 if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2445 if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2446 2447 ptype = product->type; 2448 if (A->symmetric && ptype == MATPRODUCT_AtB) { 2449 ptype = MATPRODUCT_AB; 2450 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 2451 } 2452 if (B->symmetric && ptype == MATPRODUCT_ABt) { 2453 ptype = MATPRODUCT_AB; 2454 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 2455 } 2456 biscompressed = PETSC_FALSE; 2457 ciscompressed = PETSC_FALSE; 2458 switch (ptype) { 2459 case MATPRODUCT_AB: 2460 m = A->rmap->n; 2461 n = B->cmap->n; 2462 k = A->cmap->n; 2463 Amat = Acusp->mat; 2464 Bmat = Bcusp->mat; 2465 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2466 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2467 break; 2468 case MATPRODUCT_AtB: 2469 m = A->cmap->n; 2470 n = B->cmap->n; 2471 k = A->rmap->n; 2472 ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr); 2473 Amat = Acusp->matTranspose; 2474 Bmat = Bcusp->mat; 2475 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2476 break; 2477 case MATPRODUCT_ABt: 2478 m = A->rmap->n; 2479 n = B->rmap->n; 2480 k = A->cmap->n; 2481 ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr); 2482 Amat = Acusp->mat; 2483 Bmat = Bcusp->matTranspose; 2484 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2485 break; 2486 default: 2487 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2488 
} 2489 2490 /* create cusparse matrix */ 2491 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2492 ierr = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 2493 c = (Mat_SeqAIJ*)C->data; 2494 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2495 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 2496 Ccsr = new CsrMatrix; 2497 2498 c->compressedrow.use = ciscompressed; 2499 if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */ 2500 c->compressedrow.nrows = a->compressedrow.nrows; 2501 ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr); 2502 ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr); 2503 Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 2504 Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 2505 Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows); 2506 } else { 2507 c->compressedrow.nrows = 0; 2508 c->compressedrow.i = NULL; 2509 c->compressedrow.rindex = NULL; 2510 Ccusp->workVector = NULL; 2511 Cmat->cprowIndices = NULL; 2512 } 2513 Ccusp->nrows = ciscompressed ? 
c->compressedrow.nrows : m; 2514 Ccusp->mat = Cmat; 2515 Ccusp->mat->mat = Ccsr; 2516 Ccsr->num_rows = Ccusp->nrows; 2517 Ccsr->num_cols = n; 2518 Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1); 2519 stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat); 2520 stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 2521 stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 2522 cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 2523 cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 2524 cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 2525 cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2526 cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2527 cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2528 if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! 
*/ 2529 thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0); 2530 c->nz = 0; 2531 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2532 Ccsr->values = new THRUSTARRAY(c->nz); 2533 goto finalizesym; 2534 } 2535 2536 if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2537 if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2538 Acsr = (CsrMatrix*)Amat->mat; 2539 if (!biscompressed) { 2540 Bcsr = (CsrMatrix*)Bmat->mat; 2541 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2542 BmatSpDescr = Bmat->matDescr; 2543 #endif 2544 } else { /* we need to use row offsets for the full matrix */ 2545 CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat; 2546 Bcsr = new CsrMatrix; 2547 Bcsr->num_rows = B->rmap->n; 2548 Bcsr->num_cols = cBcsr->num_cols; 2549 Bcsr->num_entries = cBcsr->num_entries; 2550 Bcsr->column_indices = cBcsr->column_indices; 2551 Bcsr->values = cBcsr->values; 2552 if (!Bcusp->rowoffsets_gpu) { 2553 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 2554 Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 2555 ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 2556 } 2557 Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 2558 mmdata->Bcsr = Bcsr; 2559 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2560 if (Bcsr->num_rows && Bcsr->num_cols) { 2561 stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, 2562 Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2563 Bcsr->values->data().get(), 2564 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2565 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2566 } 2567 BmatSpDescr = mmdata->matSpBDescr; 2568 #endif 2569 } 2570 if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct"); 2571 if (!Bcsr) 
SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct"); 2572 /* precompute flops count */ 2573 if (ptype == MATPRODUCT_AB) { 2574 for (i=0, flops = 0; i<A->rmap->n; i++) { 2575 const PetscInt st = a->i[i]; 2576 const PetscInt en = a->i[i+1]; 2577 for (j=st; j<en; j++) { 2578 const PetscInt brow = a->j[j]; 2579 flops += 2.*(b->i[brow+1] - b->i[brow]); 2580 } 2581 } 2582 } else if (ptype == MATPRODUCT_AtB) { 2583 for (i=0, flops = 0; i<A->rmap->n; i++) { 2584 const PetscInt anzi = a->i[i+1] - a->i[i]; 2585 const PetscInt bnzi = b->i[i+1] - b->i[i]; 2586 flops += (2.*anzi)*bnzi; 2587 } 2588 } else { /* TODO */ 2589 flops = 0.; 2590 } 2591 2592 mmdata->flops = flops; 2593 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2594 2595 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2596 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2597 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, 2598 NULL, NULL, NULL, 2599 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2600 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2601 stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2602 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2603 { 2604 /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it. 
2605 We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse 2606 */ 2607 void* dBuffer1 = NULL; 2608 void* dBuffer2 = NULL; 2609 void* dBuffer3 = NULL; 2610 /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */ 2611 size_t bufferSize1 = 0; 2612 size_t bufferSize2 = 0; 2613 size_t bufferSize3 = 0; 2614 size_t bufferSize4 = 0; 2615 size_t bufferSize5 = 0; 2616 2617 /*----------------------------------------------------------------------*/ 2618 /* ask bufferSize1 bytes for external memory */ 2619 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2620 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2621 &bufferSize1, NULL);CHKERRCUSPARSE(stat); 2622 cerr = cudaMalloc((void**) &dBuffer1, bufferSize1);CHKERRCUDA(cerr); 2623 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2624 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2625 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2626 &bufferSize1, dBuffer1);CHKERRCUSPARSE(stat); 2627 2628 /*----------------------------------------------------------------------*/ 2629 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2630 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2631 &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);CHKERRCUSPARSE(stat); 2632 cerr = cudaMalloc((void**) &dBuffer2, bufferSize2);CHKERRCUDA(cerr); 2633 cerr = cudaMalloc((void**) &dBuffer3, bufferSize3);CHKERRCUDA(cerr); 2634 cerr = cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4);CHKERRCUDA(cerr); 2635 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2636 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2637 &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, 
mmdata->dBuffer4);CHKERRCUSPARSE(stat); 2638 cerr = cudaFree(dBuffer1);CHKERRCUDA(cerr); 2639 cerr = cudaFree(dBuffer2);CHKERRCUDA(cerr); 2640 2641 /*----------------------------------------------------------------------*/ 2642 /* get matrix C non-zero entries C_nnz1 */ 2643 stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat); 2644 c->nz = (PetscInt) C_nnz1; 2645 /* allocate matrix C */ 2646 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2647 Ccsr->values = new THRUSTARRAY(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2648 /* update matC with the new pointers */ 2649 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2650 Ccsr->values->data().get());CHKERRCUSPARSE(stat); 2651 2652 /*----------------------------------------------------------------------*/ 2653 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2654 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2655 &bufferSize5, NULL);CHKERRCUSPARSE(stat); 2656 cerr = cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5);CHKERRCUDA(cerr); 2657 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2658 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2659 &bufferSize5, mmdata->dBuffer5);CHKERRCUSPARSE(stat); 2660 cerr = cudaFree(dBuffer3);CHKERRCUDA(cerr); 2661 stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, 2662 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2663 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2664 mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2665 ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024);CHKERRQ(ierr); 2666 } 
2667 #else // ~PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2668 size_t bufSize2; 2669 /* ask bufferSize bytes for external memory */ 2670 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 2671 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2672 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2673 mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat); 2674 cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr); 2675 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2676 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 2677 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2678 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2679 mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat); 2680 /* ask bufferSize again bytes for external memory */ 2681 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2682 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2683 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2684 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat); 2685 /* The CUSPARSE documentation is not clear, nor the API 2686 We need both buffers to perform the operations properly! 2687 mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 2688 it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 2689 is stored in the descriptor! What a messy API... 
*/ 2690 cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr); 2691 /* compute the intermediate product of A * B */ 2692 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2693 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2694 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2695 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2696 /* get matrix C non-zero entries C_nnz1 */ 2697 stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat); 2698 c->nz = (PetscInt) C_nnz1; 2699 ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr); 2700 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2701 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2702 Ccsr->values = new THRUSTARRAY(c->nz); 2703 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2704 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2705 Ccsr->values->data().get());CHKERRCUSPARSE(stat); 2706 stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, 2707 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2708 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2709 #endif 2710 #else 2711 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 2712 stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, 2713 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2714 Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2715 Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2716 Cmat->descr, Ccsr->row_offsets->data().get(), 
&cnz);CHKERRCUSPARSE(stat); 2717 c->nz = cnz; 2718 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2719 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2720 Ccsr->values = new THRUSTARRAY(c->nz); 2721 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2722 2723 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2724 /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 2725 I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 2726 D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */ 2727 stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, 2728 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2729 Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2730 Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2731 Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2732 #endif 2733 ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2734 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2735 finalizesym: 2736 c->singlemalloc = PETSC_FALSE; 2737 c->free_a = PETSC_TRUE; 2738 c->free_ij = PETSC_TRUE; 2739 ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 2740 ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 2741 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 2742 PetscInt *d_i = c->i; 2743 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 2744 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 2745 ii = *Ccsr->row_offsets; 2746 jj = *Ccsr->column_indices; 2747 if (ciscompressed) d_i = c->compressedrow.i; 2748 cerr = 
cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2749 cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2750 } else { 2751 PetscInt *d_i = c->i; 2752 if (ciscompressed) d_i = c->compressedrow.i; 2753 cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2754 cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2755 } 2756 if (ciscompressed) { /* need to expand host row offsets */ 2757 PetscInt r = 0; 2758 c->i[0] = 0; 2759 for (k = 0; k < c->compressedrow.nrows; k++) { 2760 const PetscInt next = c->compressedrow.rindex[k]; 2761 const PetscInt old = c->compressedrow.i[k]; 2762 for (; r < next; r++) c->i[r+1] = old; 2763 } 2764 for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows]; 2765 } 2766 ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 2767 ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 2768 ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 2769 c->maxnz = c->nz; 2770 c->nonzerorowcnt = 0; 2771 c->rmax = 0; 2772 for (k = 0; k < m; k++) { 2773 const PetscInt nn = c->i[k+1] - c->i[k]; 2774 c->ilen[k] = c->imax[k] = nn; 2775 c->nonzerorowcnt += (PetscInt)!!nn; 2776 c->rmax = PetscMax(c->rmax,nn); 2777 } 2778 ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr); 2779 ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 2780 Ccsr->num_entries = c->nz; 2781 2782 C->nonzerostate++; 2783 ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr); 2784 ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr); 2785 Ccusp->nonzerostate = C->nonzerostate; 2786 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 2787 C->preallocated = PETSC_TRUE; 2788 C->assembled = PETSC_FALSE; 2789 C->was_assembled = PETSC_FALSE; 
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/* handles sparse or dense B.
   Decides, based on the types/bind-to-cpu state of the product operands and on
   -mat*_backend_cpu command line options, which productsymbolic implementation
   (GPU cuSPARSE path, GPU-dense path, basic ABC path, or the CPU SeqAIJ fallback)
   is installed in mat->ops. */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscErrorCode ierr;
  PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr);
  /* B only counts as a cuSPARSE operand if neither A nor B is bound to the CPU */
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
  }
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr);
    }
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    /* The option name depends on whether the user entered through the legacy API
       (MatMatMult etc., api_user set) or through the MatProduct interface */
    switch (product->type) {
    case MATPRODUCT_AB:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_ab_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_AtB:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_atb_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_PtAP:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_ptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_RARt:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_rart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_ABC:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_abc_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    default:
      break;
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE; /* user forced the CPU backend */
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) {
        ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* yy = A xx */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* zz = A xx + yy */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* yy = A^H xx */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* zz = A^H xx + yy */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* yy = A^T xx */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* y[idx[i]] += x[i] for 0 <= i < n. Used to scatter-add the short work-vector
   result of a compressed-row SpMV back into the full-length output vector.
   Launched with a 1D grid; the i < n guard handles the ragged last block. */
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}

/* z = op(A) x + y.
   If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny; /* lengths of x and y in y = op(A) x; set only for the CSR format */
#endif

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) { /* empty matrix: result is just beta*y (0 or a copy of yy) */
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    /* For ^H, or when an explicit transpose was not requested, multiply with the
       original matrix using a cuSPARSE transpose op; otherwise build/reuse the
       explicitly stored transpose and multiply it non-transposed */
    if (herm || !A->form_explicit_transpose) {
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
         allocated to accommodate different uses. So we get the length info directly from mat.
      */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
      */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA is used as an index into the cuSpMV descriptor cache below, so guard its range */
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        cudaError_t cerr;
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                       matstruct->matDescr,
                                       matstruct->cuSpMV[opA].vecXDescr, beta,
                                       matstruct->cuSpMV[opA].vecYDescr,
                                       cusparse_scalartype,
                                       cusparsestruct->spmvAlg,
                                       &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                          matstruct->alpha_one,
                          matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                          matstruct->cuSpMV[opA].vecXDescr,
                          beta,
                          matstruct->cuSpMV[opA].vecYDescr,
                          cusparse_scalartype,
                          cusparsestruct->spmvAlg,
                          matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
#else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
#endif
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
        */
#if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
#else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
#endif
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* zz = A^T xx + yy */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Runs the host-side SeqAIJ assembly, then drops the cached device matrix
   if the nonzero structure changed during assembly (it would be stale). */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode     ierr;
  PetscObjectState   onnz = A->nonzerostate;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    cudaError_t cerr;

    ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr);
    cerr = cudaFree(cusp->deviceMat);CHKERRCUDA(cerr);
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
. 
nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Frees the GPU-side data (plain or factored form), removes the composed
   function pointers, then falls through to the host SeqAIJ destructor. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaijcusparse_hypre_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate on the host, then convert the copy in place to the CUSPARSE type */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Y = Y + a*X, performed on the GPU when possible:
   - identical nonzero pattern -> single cublas axpy on the value arrays
   - subset pattern            -> cusparse csrgeam (sparse matrix addition)
   - otherwise                 -> fall back to the host MatAXPY_SeqAIJ */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) { /* mixed backends: do it on the host */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
  if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy: same nz count and identical row/column structure */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    cusparseStatus_t stat;
    PetscScalar      b = 1.0; /* coefficient of Y in a*X + 1.0*Y */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t           bufferSize;
    void             *buffer;
    cudaError_t      cerr;
#endif

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    /* scalars a,b live on the host here, so switch the cuSPARSE pointer mode */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* note the output aliases Y's own values/structure (ay, csry): in-place geam */
    stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                          cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                               cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    cerr = cudaFree(buffer);CHKERRCUDA(cerr);
#else
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                               cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#endif
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical structure: ay += a*ax is a dense axpy on the value arrays */
    cublasHandle_t cublasv2handle;
    cublasStatus_t berr;
    PetscBLASInt   one = 1, bnz = 1;

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
    ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Y = a*Y, done as a cublas scal on the GPU value array */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat
Y,PetscScalar a) 3387 { 3388 PetscErrorCode ierr; 3389 Mat_SeqAIJ *y = (Mat_SeqAIJ*)Y->data; 3390 PetscScalar *ay; 3391 cublasHandle_t cublasv2handle; 3392 cublasStatus_t berr; 3393 PetscBLASInt one = 1, bnz = 1; 3394 3395 PetscFunctionBegin; 3396 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3397 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 3398 ierr = PetscBLASIntCast(y->nz,&bnz);CHKERRQ(ierr); 3399 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3400 berr = cublasXscal(cublasv2handle,bnz,&a,ay,one);CHKERRCUBLAS(berr); 3401 ierr = PetscLogGpuFlops(bnz);CHKERRQ(ierr); 3402 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3403 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3404 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3405 PetscFunctionReturn(0); 3406 } 3407 3408 static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) 3409 { 3410 PetscErrorCode ierr; 3411 PetscBool both = PETSC_FALSE; 3412 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3413 3414 PetscFunctionBegin; 3415 if (A->factortype == MAT_FACTOR_NONE) { 3416 Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr; 3417 if (spptr->mat) { 3418 CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat; 3419 if (matrix->values) { 3420 both = PETSC_TRUE; 3421 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3422 } 3423 } 3424 if (spptr->matTranspose) { 3425 CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat; 3426 if (matrix->values) { 3427 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3428 } 3429 } 3430 } 3431 //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 3432 ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr); 3433 ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 3434 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 3435 else A->offloadmask = PETSC_OFFLOAD_CPU; 3436 PetscFunctionReturn(0); 3437 } 3438 3439 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg) 3440 { 3441 Mat_SeqAIJ *a = 
(Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  if (flg) {
    /* bind to CPU: make sure the host copy is current, then install the SeqAIJ (host) kernels */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* clear the GPU-only composed methods so queries fall back to the host paths */
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  } else {
    /* bind to GPU: install the CUSPARSE kernels and the COO/array accessors */
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  /* inode optimizations only apply to the host kernels */
  if (flg && a->inode.size) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}

/* Convert a SeqAIJ matrix to SeqAIJCUSPARSE: create the cusparse context and install the GPU ops */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat;

  ierr =
PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      /* plain matrix: allocate the SpMV/SpMM context */
      Mat_SeqAIJCUSPARSE *spptr;
      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      spptr->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
      spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
 #else
      spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
 #endif
      spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = spptr;
    } else {
      /* factored matrix: allocate the triangular-factors context instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* install the GPU kernels (bindtocpu with PETSC_FALSE) and publish the type-specific methods */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
#if defined(PETSC_HAVE_HYPRE)
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}

/* Type constructor: build a SeqAIJ matrix, then convert it in place to SeqAIJCUSPARSE */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

   Level: beginner

.seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);

/* Register the CUSPARSE (LU/Cholesky/ILU/ICC) and CUSPARSE band (LU) factorizations with the solver registry */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}

/* Free all GPU resources held by a Mat_SeqAIJCUSPARSE context, then the context itself */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->csr2csc_i;
    if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
    ierr =
PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Delete the thrust vectors backing a CsrMatrix and the CsrMatrix itself; *mat is nulled */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    delete (*mat)->values;
    delete (*mat)->column_indices;
    delete (*mat)->row_offsets;
    delete *mat;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}

/* Free a triangular-factor struct: cusparse descriptors, analysis info, CSR storage, and work buffers */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    if ((*trifactor)->solveBuffer) {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
    if ((*trifactor)->AA_h) {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
#endif
    ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Free a SpMV/SpMM struct: the CSR (or HYB) storage, descriptors, device scalar constants,
   and (CUDA >= 11) the SpMV descriptors/buffers; *matstruct is nulled */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
#endif
      } else {
        mat  = (CsrMatrix*)(*matstruct)->mat;
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr); /* was unchecked; check like every other destroy in this file */
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}

/* Release the contents of the triangular-factors context without freeing the context or its cusparse handle */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
(*trifactors)->workVector = NULL;
    if ((*trifactors)->a_band_d) {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);}
    if ((*trifactors)->i_band_d) {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);}
    (*trifactors)->init_dev_prop = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}

/* Fully destroy the triangular-factors context: reset contents, destroy the cusparse handle, free the struct */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    if ((handle = (*trifactors)->handle)) { /* extra parentheses: assignment, not comparison, is intended */
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Lexicographic (row, then column) ordering of (i,j) tuples; used to sort COO entries */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    if (t1.get<0>() < t2.get<0>()) return true;
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
    return false;
  }
};

/* Equality of (i,j) tuples; used to remove repeated COO entries */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
    return true;
  }
};

/* 0 if the two values are equal, 1 otherwise; used with adjacent_difference to flag sequence changes */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return t1 == t2 ?
0 : 1;
  }
};

/* Logical OR of two flags; combines the row-change and column-change indicators */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return t1||t2;
  }
};

#include <thrust/iterator/discard_iterator.h>
/* Insert or add values (given in the caller's original COO order) into the GPU CSR value array,
   using the permutation/compression maps prepared by MatSetPreallocationCOO_SeqAIJCUSPARSE() */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *cooPerm_v = NULL; /* device copy of v[] when v is host memory */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *matrix;
  PetscErrorCode                        ierr;
  PetscInt                              n;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO preallocation info: behave like a plain final assembly */
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add them first */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
        cooPerm_a = [0,0,1,2,3,4]. The length is n, the number of nonzeros in d_v[].
        cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals()); /* values[i] = d_v[cooPerm[i]] */
    }
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr =
PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
  ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr);
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}

/* Mark the cached transpose as stale; with destroy=PETSC_TRUE also free it and the csr2csc map */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (!cusp) PetscFunctionReturn(0);
  if (destroy) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
    delete cusp->csr2csc_i;
    cusp->csr2csc_i = NULL;
  }
  A->transupdated = PETSC_FALSE;
  PetscFunctionReturn(0);
}

#include <thrust/binary_search.h>
/* Build the CSR structure (host and device) from COO (i,j) input, caching the sorting permutation
   (cooPerm) and, when there are duplicates, the compression map (cooPerm_a) for MatSetValuesCOO() */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0; /* nzr counts nonempty rows */
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  cooPerm_n = cusp->cooPerm ?
cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    /* the number of COO entries changed: discard the cached maps */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);

    if (!cusp->cooPerm) { cusp->cooPerm = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    *cusp->cooPerm_a = d_i; /* copy the sorted array */
    THRUSTINTARRAY w = d_j;

    /*
      d_i     = [1,1,3,3,4,4]
      d_j     = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i = [1,3,3,4,4,x]
                       ^ekey
      d_j = [2,2,3,5,6,x]
                       ^nekey
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0] */
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff()); /* w: [2,2,2,3,5,6] => [2,0,0,1,1,1] */
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a = [0,0,1,1,1,1] */
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /* cooPerm_a = [0,0,1,2,3,4] */
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* rebuild the host CSR arrays from the device results */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);

  /* values have not been provided yet: the matrix is not assembled until MatSetValuesCOO() */
  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices.
3995 3996 Not collective 3997 3998 Input Parameters: 3999 + A - the matrix 4000 - compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form 4001 4002 Output Parameters: 4003 + ia - the CSR row pointers 4004 - ja - the CSR column indices 4005 4006 Level: developer 4007 4008 Notes: 4009 When compressed is true, the CSR structure does not contain empty rows 4010 4011 .seealso: MatSeqAIJCUSPARSERestoreIJ(), MatSeqAIJCUSPARSEGetArrayRead() 4012 @*/ 4013 PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j) 4014 { 4015 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4016 CsrMatrix *csr; 4017 PetscErrorCode ierr; 4018 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4019 4020 PetscFunctionBegin; 4021 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4022 if (!i || !j) PetscFunctionReturn(0); 4023 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4024 if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4025 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4026 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4027 csr = (CsrMatrix*)cusp->mat->mat; 4028 if (i) { 4029 if (!compressed && a->compressedrow.use) { /* need full row offset */ 4030 if (!cusp->rowoffsets_gpu) { 4031 cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 4032 cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 4033 ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 4034 } 4035 *i = cusp->rowoffsets_gpu->data().get(); 4036 } else *i = csr->row_offsets->data().get(); 4037 } 4038 if (j) *j = csr->column_indices->data().get(); 4039 PetscFunctionReturn(0); 4040 } 4041 4042 /*@C 4043 MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ() 4044 4045 Not collective 4046 4047 Input Parameters: 4048 + A - 
the matrix 4049 - compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form 4050 4051 Output Parameters: 4052 + ia - the CSR row pointers 4053 - ja - the CSR column indices 4054 4055 Level: developer 4056 4057 .seealso: MatSeqAIJCUSPARSEGetIJ() 4058 @*/ 4059 PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j) 4060 { 4061 PetscFunctionBegin; 4062 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4063 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4064 if (i) *i = NULL; 4065 if (j) *j = NULL; 4066 PetscFunctionReturn(0); 4067 } 4068 4069 /*@C 4070 MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 4071 4072 Not Collective 4073 4074 Input Parameter: 4075 . A - a MATSEQAIJCUSPARSE matrix 4076 4077 Output Parameter: 4078 . a - pointer to the device data 4079 4080 Level: developer 4081 4082 Notes: may trigger host-device copies if up-to-date matrix data is on host 4083 4084 .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArrayRead() 4085 @*/ 4086 PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a) 4087 { 4088 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4089 CsrMatrix *csr; 4090 PetscErrorCode ierr; 4091 4092 PetscFunctionBegin; 4093 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4094 PetscValidPointer(a,2); 4095 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4096 if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4097 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4098 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4099 csr = (CsrMatrix*)cusp->mat->mat; 4100 if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 4101 *a = csr->values->data().get(); 4102 
PetscFunctionReturn(0); 4103 } 4104 4105 /*@C 4106 MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead() 4107 4108 Not Collective 4109 4110 Input Parameter: 4111 . A - a MATSEQAIJCUSPARSE matrix 4112 4113 Output Parameter: 4114 . a - pointer to the device data 4115 4116 Level: developer 4117 4118 .seealso: MatSeqAIJCUSPARSEGetArrayRead() 4119 @*/ 4120 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a) 4121 { 4122 PetscFunctionBegin; 4123 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4124 PetscValidPointer(a,2); 4125 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4126 *a = NULL; 4127 PetscFunctionReturn(0); 4128 } 4129 4130 /*@C 4131 MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 4132 4133 Not Collective 4134 4135 Input Parameter: 4136 . A - a MATSEQAIJCUSPARSE matrix 4137 4138 Output Parameter: 4139 . a - pointer to the device data 4140 4141 Level: developer 4142 4143 Notes: may trigger host-device copies if up-to-date matrix data is on host 4144 4145 .seealso: MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArray() 4146 @*/ 4147 PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a) 4148 { 4149 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4150 CsrMatrix *csr; 4151 PetscErrorCode ierr; 4152 4153 PetscFunctionBegin; 4154 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4155 PetscValidPointer(a,2); 4156 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4157 if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4158 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4159 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4160 csr = (CsrMatrix*)cusp->mat->mat; 4161 if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA 
memory"); 4162 *a = csr->values->data().get(); 4163 A->offloadmask = PETSC_OFFLOAD_GPU; 4164 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); 4165 PetscFunctionReturn(0); 4166 } 4167 /*@C 4168 MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray() 4169 4170 Not Collective 4171 4172 Input Parameter: 4173 . A - a MATSEQAIJCUSPARSE matrix 4174 4175 Output Parameter: 4176 . a - pointer to the device data 4177 4178 Level: developer 4179 4180 .seealso: MatSeqAIJCUSPARSEGetArray() 4181 @*/ 4182 PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a) 4183 { 4184 PetscErrorCode ierr; 4185 4186 PetscFunctionBegin; 4187 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4188 PetscValidPointer(a,2); 4189 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4190 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 4191 *a = NULL; 4192 PetscFunctionReturn(0); 4193 } 4194 4195 /*@C 4196 MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 4197 4198 Not Collective 4199 4200 Input Parameter: 4201 . A - a MATSEQAIJCUSPARSE matrix 4202 4203 Output Parameter: 4204 . 
a - pointer to the device data

   Level: developer

   Notes: does not trigger host-device copies and flags data validity on the GPU

.seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSERestoreArrayWrite()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* only the CSR storage format exposes a raw value array; ELL/HYB layouts do not */
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* write-only access: no MatSeqAIJCUSPARSECopyToGPU() here, existing device values are
     assumed to be overwritten by the caller; the device structure must already exist */
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  /* the GPU copy becomes the valid one and any cached explicit transpose is now stale */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite()

   Not Collective

   Input Parameter:
.  A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.
a - pointer to the device data

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetArrayWrite()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* values were (re)written through the pointer: bump the object state so cached
     data that depends on the values is recomputed */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL;
  PetscFunctionReturn(0);
}

/* Lexicographic (row,col) ordering on (row, col, value, flag) COO tuples;
   used as the comparator when merging the COO expansions of two matrices */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    if (t1.get<0>() < t2.get<0>()) return true;
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
    return false;
  }
};

/* Adds a fixed offset to an index; used (via transform iterators) to shift the
   column indices of B by A's column count when concatenating [A B] */
struct Shift
{
  int _shift;

  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &c)
  {
    return c + _shift;
  }
};

/* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows.
[A';B']' operation in matlab notation */
/* Result C = [A B] is m x (nA+nB). Strategy: expand A and B to COO on the GPU,
   tag each entry with its origin (1 for A, 0 for B), do a stable merge sorted by
   (row,col), then convert back to CSR. The positions of A's and B's entries inside
   the merged array are recorded in Ccusp->cooPerm so the MAT_REUSE_MATRIX path can
   scatter new values without redoing the symbolic merge. */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     Annz,Bnnz;
  cusparseStatus_t             stat;
  PetscInt                     i,m,n,zero = 0;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n);
  if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
  if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* build C from scratch: allocate the device CSR structure directly */
    m     = A->rmap->n;
    n     = A->cmap->n + B->cmap->n;
    ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
    ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
    ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    c     = (Mat_SeqAIJ*)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
    Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr  = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows    = m;
    Ccusp->mat      = Cmat;
    Ccusp->mat->mat = Ccsr;
    Ccsr->num_rows  = m;
    Ccsr->num_cols  = n;
    stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
    /* device-resident scalar constants used by cusparse SpMV/SpMM calls */
    cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
    if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr = (CsrMatrix*)Acusp->mat->mat;
    Bcsr = (CsrMatrix*)Bcusp->mat->mat;
    Annz = (PetscInt)Acsr->column_indices->size();
    Bnnz = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    Ccsr->num_entries = c->nz;
    Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      auto Acoo = new THRUSTINTARRAY32(Annz);
      auto Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff,*Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
          ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
          ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      /* expand both CSR row offsets into explicit COO row indices */
      stat = cusparseXcsr2coo(Acusp->handle,
                              Aroff->data().get(),
                              Annz,
                              m,
                              Acoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle,
                              Broff->data().get(),
                              Bnnz,
                              m,
                              Bcoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      /* origin tags: entries coming from A carry 1, entries from B carry 0 */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
      /* shift B's columns by A->cmap->n on the fly while merging */
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
      auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
      auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
      auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
      auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
      auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
      auto p1 = Ccusp->cooPerm->begin();
      auto p2 = Ccusp->cooPerm->begin();
      thrust::advance(p2,Annz);
      /* merge the two sorted COO streams by (row,col); tags land in wPerm */
      PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
      /* undo the in-place column shift applied to B above */
      thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
#endif
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      /* split merged positions by origin tag: A's positions (tag 1) go to the first
         Annz slots of cooPerm, B's (tag 0) to the remaining Bnnz slots */
      auto pred = thrust::identity<int>();
      PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
      PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
#endif
      /* compress merged COO row indices back into C's CSR row offsets */
      stat = cusparseXcoo2csr(Ccusp->handle,
                              Ccoo->data().get(),
                              c->nz,
                              m,
                              Ccsr->row_offsets->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
                               Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
        ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr);
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix *CcsrT = new CsrMatrix;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated = PETSC_TRUE;
        Ccusp->rowoffsets_gpu = NULL;
        CmatT->cprowIndices = NULL;
        CmatT->mat = CcsrT;
        CcsrT->num_rows = n;
        CcsrT->num_cols = m;
        CcsrT->num_entries = c->nz;

        CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values = new THRUSTARRAY(c->nz);

        /* C^T = [A^T; B^T] stacked by rows: A^T's row offsets come first, then B^T's
           shifted by a->nz (A's nonzero count); column indices and values are simply
           concatenated. The advance(rT,-1) overlaps the shared boundary offset. */
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
          thrust::advance(rT,-1);
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
          thrust::copy(titb,tite,rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

        stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
                                 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
                                 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror the (i,j) structure on the host so C behaves as a regular SeqAIJ matrix */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii   = *Ccsr->row_offsets;
      jj   = *Ccsr->column_indices;
      cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    } else {
      cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
    c->maxnz = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax = 0;
    /* per-row lengths and row statistics derived from the row offsets */
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i+1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax,nn);
    }
    ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
    (*C)->nonzerostate++;
    ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
    ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* MAT_REUSE_MATRIX: structure is unchanged, scatter new values through cooPerm */
    if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n);
    c = (Mat_SeqAIJ*)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
      if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
      if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
      if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
      if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix*)Acusp->mat->mat;
      Bcsr = (CsrMatrix*)Bcusp->mat->mat;
      Ccsr = (CsrMatrix*)Ccusp->mat->mat;
      if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
      if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
      if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
      if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
      if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid,Acsr->num_entries);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      /* scatter A's values into C at the positions recorded in cooPerm[0..Annz) */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      thrust::for_each(zibait,zieait,VecCUDAEquals());
      /* scatter B's values into C at the positions recorded in cooPerm[Annz..nz) */
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
      thrust::for_each(zibbit,ziebit,VecCUDAEquals());
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr);
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        /* transpose values are a plain concatenation of A^T's then B^T's values */
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    }
  }
  ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}

/* Gathers v[k] = A->a[idx[k]] (k = 0..n-1) from the device copy of the matrix values;
   v may point to host or device memory (detected via isCudaMem) */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;
  const PetscScalar *av;
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    THRUSTARRAY *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      /* stage the gathered values in a device buffer, copied back to host below */
      w  = new THRUSTARRAY(n);
      dv = w->data();
    }
thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* gather on the device: dv[k] = dav[widx[k]] */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) { /* v is host memory: move the staged result down from the device */
      cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    delete w;
  } else {
    /* no index set: contiguous copy of the first n values */
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  /* when v is on the host the transfer above was device->host, so log it as GPU->CPU
     traffic (the original code logged PetscLogCpuToGpu, i.e. the wrong direction) */
  if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); }
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}