/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <thrust/async/for_each.h>

/* String table for MatCUSPARSEStorageFormat in the layout PetscOptionsEnum() expects:
   the enum value names (in 0-based value order), then the enum type name, the common
   prefix, and a terminating 0. */
const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
    0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

    typedef enum {
        CUSPARSE_MV_ALG_DEFAULT = 0,
        CUSPARSE_COOMV_ALG      = 1,
        CUSPARSE_CSRMV_ALG1     = 2,
        CUSPARSE_CSRMV_ALG2     = 3
    } cusparseSpMVAlg_t;

    typedef enum {
        CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
        CUSPARSE_COOMM_ALG1     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
        CUSPARSE_COOMM_ALG2     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
        CUSPARSE_COOMM_ALG3     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
        CUSPARSE_CSRMM_ALG1     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
        CUSPARSE_SPMM_ALG_DEFAULT = 0,
        CUSPARSE_SPMM_COO_ALG1    = 1,
        CUSPARSE_SPMM_COO_ALG2    = 2,
        CUSPARSE_SPMM_COO_ALG3    = 3,
        CUSPARSE_SPMM_COO_ALG4    = 5,
        CUSPARSE_SPMM_CSR_ALG1    = 4,
        CUSPARSE_SPMM_CSR_ALG2    = 6,
    } cusparseSpMMAlg_t;

    typedef enum {
        CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
        CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
    } cusparseCsr2CscAlg_t;
  */
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
/* NOTE: entries are in PETSc's 0..6 option-position order; CUSPARSE_SPMM_COO_ALG4/CSR_ALG2
   have cuSPARSE values 5/6, hence "COO_ALG4" appearing after "CSR_ALG1" above value 4. */
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif

/* Forward declarations of the factorization, solve, mult, and maintenance routines
   implemented later in this file */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static
PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

/* Destructors for the GPU-side data structures.
   NOTE: the two MatSeqAIJCUSPARSEMultStruct_Destroy declarations are C++ overloads
   distinguished by their argument types. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);

PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);

/* Attach a caller-provided CUDA stream to the matrix' cuSPARSE handle.
   Errors with PETSC_ERR_COR when the GPU-side data (A->spptr) has not been created yet. */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

/* Install a caller-provided cuSPARSE handle on the matrix.  If the matrix already owns a
   different handle it is destroyed first; the pointer mode is then (re)set to
   CUSPARSE_POINTER_MODE_DEVICE (i.e. scalar arguments such as alpha/beta are read from
   device memory). */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      stat =
             cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

/* Forget (without destroying) the cuSPARSE handle stored on the matrix -- presumably the
   handle is owned elsewhere when this is used (cf. MatCUSPARSESetHandle()).  Silently a
   no-op for matrices that are not MATSEQAIJCUSPARSE or have no GPU-side data. */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          flg;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg || !cusparsestruct) PetscFunctionReturn(0);
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

/* Query callback composed on factor matrices: reports MATSOLVERCUSPARSE as the solver type */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/* Factory routine registered for MATSOLVERCUSPARSE: creates the (empty) factor matrix B
   for A and installs the symbolic-factorization function pointers for the requested
   factor type.  GPU code paths are selected unless A is bound to the CPU, in which case
   the plain SeqAIJ symbolic routines are used. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  /* propagate CPU binding from A to the factor when requested */
  if (A->boundtocpu && A->bindingpropagates) { ierr = MatBindToCPU(*B,PETSC_TRUE);CHKERRQ(ierr); }
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    /* preferred orderings: nested dissection for full LU, natural for the incomplete variants */
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr);
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr);
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  (*B)->canuseordering = PETSC_TRUE;
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Implementation of MatCUSPARSESetFormat() for SEQAIJCUSPARSE: records the storage
   format; MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL both set the single format field here. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.
   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB.
   The latter two require CUDA 4.2)

   Output Parameter:

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation (no-op for types without one) */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* MatSetOption implementation: MAT_FORM_EXPLICIT_TRANSPOSE is handled here because the
   cached GPU transpose must be invalidated when the option is switched off; every other
   option is forwarded to the CPU SeqAIJ implementation. */
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_FORM_EXPLICIT_TRANSPOSE:
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);}
    A->form_explicit_transpose = flg;
    break;
  default:
    ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
    break;
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);

/* Numeric LU factorization: the factorization itself runs on the CPU via the SeqAIJ
   kernel (after syncing A's values back from the GPU); the triangular factors are then
   copied to the GPU.  Solve function pointers are chosen depending on whether the
   row/column permutations are identities (natural ordering). */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used.
  */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Process -mat_cusparse_* runtime options: storage format for SpMV / all operations,
   and (CUDA >= 11 only) the cuSPARSE SpMV, SpMM and csr2csc algorithm choices.
   PetscOptionsEnum() assigns values by position in the string tables above, so after a
   user selection we verify the corresponding cuSPARSE enum still has the expected value. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {   /* format/algorithm options apply to unfactored matrices only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
    if (flg && CUSPARSE_SPMV_CSR_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Symbolic ILU: drop any existing GPU triangular-factor data, run the CPU symbolic
   factorization, and install the CUSPARSE numeric-factorization kernel. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic LU: same pattern as ILU above -- CPU symbolic phase, CUSPARSE numeric phase */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic ICC: CPU symbolic phase, CUSPARSE numeric phase */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic Cholesky: CPU symbolic phase, CUSPARSE numeric phase */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Build (first call) or refresh (subsequent calls) the GPU copy of the unit
   lower-triangular factor L of an ILU/LU factorization held in the host SeqAIJ factor
   matrix A, and run the cuSPARSE triangular-solve analysis on it. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* only (re)build when the host copy is the authoritative one */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {   /* first call: build structure + values and run the analysis */
        PetscScalar *AALo;

        /* pinned (page-locked) host staging buffers for the assembled CSR arrays */
        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          offset      += nz;
          AjLo[offset] = (PetscInt) i;   /* unit diagonal entry appended after the strictly-lower part */
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* csrsv2-style API needs an explicitly sized external work buffer */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 loTriFactor->solveInfo,
                                 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                                 loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        /* keep the pinned value buffer (AA_h) for later values-only refreshes; the
           index staging buffers are no longer needed */
        loTriFactor->AA_h = AALo;
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v      = aa;
        vi     = aj;
        offset = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v  += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Build (first call) or refresh (subsequent calls) the GPU copy of the upper-triangular
   factor U of an ILU/LU factorization held in the host SeqAIJ factor matrix A.  The
   diagonal is stored inverted (1/d) so the GPU matrix carries the reciprocal pivots;
   the cuSPARSE triangular-solve analysis is run on the result. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* only (re)build when the host copy is the authoritative one */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];
      if (!upTriFactor) {   /* first call: build structure + values and run the analysis */
        PetscScalar *AAUp;

        /* pinned (page-locked) host staging buffers for the assembled CSR arrays */
        cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix; adiag[] decreases with i in the SeqAIJ
           factor layout, so the rows of U are walked bottom-up while 'offset' counts
           backwards from nzUpper */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;
          vi = aj + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1./v[nz];   /* store reciprocal pivot */
          AiUp[i]      = AiUp[i+1] - (nz+1);

          ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = n;
        upTriFactor->csrMat->num_cols = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* csrsv2-style API needs an explicitly sized external work buffer */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 upTriFactor->solveInfo,
                                 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                                 upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
        /* keep the pinned value buffer (AA_h) for later values-only refreshes */
        upTriFactor->AA_h = AAUp;
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      } else {   /* update values only */
        if (!upTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v = aa + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1./v[nz];
          ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v,
                               nz);CHKERRQ(ierr);
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
        ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Copy the triangular factors of A to the GPU (building the cuSPARSE solve-analysis
   data), allocate the GPU work vector of length n, and upload the row/column
   permutation indices when the factor orderings are not identities. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cusparseTriFactors->nnz=a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
    ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* upper triangular indices */
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
    ierr =
ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 711 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 712 } 713 PetscFunctionReturn(0); 714 } 715 716 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 717 { 718 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 719 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 720 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 721 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 722 cusparseStatus_t stat; 723 PetscErrorCode ierr; 724 cudaError_t cerr; 725 PetscInt *AiUp, *AjUp; 726 PetscScalar *AAUp; 727 PetscScalar *AALo; 728 PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 729 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 730 const PetscInt *ai = b->i,*aj = b->j,*vj; 731 const MatScalar *aa = b->a,*v; 732 733 PetscFunctionBegin; 734 if (!n) PetscFunctionReturn(0); 735 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 736 try { 737 cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 738 cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 739 if (!upTriFactor && !loTriFactor) { 740 /* Allocate Space for the upper triangular matrix */ 741 cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 742 cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 743 744 /* Fill the upper triangular matrix */ 745 AiUp[0]=(PetscInt) 0; 746 AiUp[n]=nzUpper; 747 offset = 0; 748 for (i=0; i<n; i++) { 749 /* set the pointers */ 750 v = aa + ai[i]; 751 vj = aj + ai[i]; 752 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 753 754 /* first, set the diagonal elements */ 755 AjUp[offset] = (PetscInt) i; 756 AAUp[offset] = (MatScalar)1.0/v[nz]; 757 AiUp[i] = offset; 758 AALo[offset] = (MatScalar)1.0/v[nz]; 

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            /* negate off-diagonals for U; lower factor additionally scales by the inverted diagonal */
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 upTriFactor->solveInfo,
                                 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                                 upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        /* the "lower" factor reuses the CSR of U and is applied transposed, hence UPPER fill here */
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 loTriFactor->solveInfo,
                                 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                                 loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else { /* factors already on the GPU: recompute values only */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Mirror the CPU ICC/Cholesky factors on the GPU and cache permutation indices for MatSolve. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  /* both triangles are applied, sharing one copy of the diagonal */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip,*rip;

    /* rperm holds the permutation, cperm its inverse */
    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Numeric Cholesky factorization: factor on the CPU, then mirror the factors on the GPU
   and select the MatSolve variant appropriate for the ordering. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Build explicit (CSC) transposes of both triangular factors and run the cusparse
   solve analysis on them, so MatSolveTranspose can use NON_TRANSPOSE solves. */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode   =
cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
               CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; /* transposing flips the fill mode */
  diagType   = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e.
the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                       loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                       loTriFactor->csrMat->values->data().get(),
                                       loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(),
                                       loTriFactorT->csrMat->values->data().get(),
                                       loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
#else
                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
#endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* bug fix: this previously called PetscLogEventBegin() a second time, leaving the
     MAT_CUSPARSEGenerateTranspose event begun twice and never ended (unbalanced profiling) */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                           loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                           loTriFactorT->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           loTriFactorT->solveInfo,
                           loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                           loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
               CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; /* transposing flips the fill mode */
  diagType   = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e.
the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                       upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                       upTriFactor->csrMat->values->data().get(),
                                       upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(),
                                       upTriFactorT->csrMat->values->data().get(),
                                       upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
#else
                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
#endif

  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* bug fix: this previously called PetscLogEventBegin() a second time, leaving the
     MAT_CUSPARSEGenerateTranspose event begun twice and never ended (unbalanced profiling) */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                           upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                           upTriFactorT->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           upTriFactorT->solveInfo,
                           upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                           upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#endif

  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}

/* host/device functor: truncate the real part of a PetscScalar to a PetscInt
   (used below to read back permutation indices that were carried through csr2csc as scalar "values") */
struct PetscScalarToPetscInt
{
  __host__ __device__
  PetscInt operator()(PetscScalar s)
  {
    return (PetscInt)PetscRealPart(s);
  }
};

/* Form (or numerically refresh) the explicit transpose of the AIJ matrix on the GPU. */
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(0); /* transpose already current: nothing to do */
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
  }
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

    /* set alpha and beta (device-resident scalars for SpMV calls) */
    err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      CsrMatrix *matrixT = new CsrMatrix;
      matstructT->mat = matrixT;
      /* transpose has swapped dimensions, same number of entries */
      matrixT->num_rows = A->cmap->n;
      matrixT->num_cols = A->rmap->n;
      matrixT->num_entries = a->nz;
      matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values = new THRUSTARRAY(a->nz);

      if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
#if PETSC_PKG_CUDA_VERSION_GE(11,2,1)
      stat = cusparseCreateCsr(&matstructT->matDescr,
                               matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                               matrixT->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
#else
      /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
         see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

         I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
         it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
         when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
      */
      if (matrixT->num_entries) {
        stat = cusparseCreateCsr(&matstructT->matDescr,
                                 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                                 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                                 matrixT->values->data().get(),
                                 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I,
                                 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);

      } else {
        matstructT->matDescr = NULL;
        matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
      }
#endif
#endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows = A->rmap->n;
      temp->num_cols = A->cmap->n;
      temp->num_entries = a->nz;
      temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle,
                              matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get());CHKERRCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      tempT->num_rows = A->rmap->n;
      tempT->num_cols = A->cmap->n;
      tempT->num_entries = a->nz;
      tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                              temp->num_cols, temp->num_entries,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get(),
                              tempT->values->data().get(),
                              tempT->column_indices->data().get(),
                              tempT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
      cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
                                         CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                              matstructT->descr, tempT->values->data().get(),
                              tempT->row_offsets->data().get(),
                              tempT->column_indices->data().get(),
                              hybMat, 0, partition);CHKERRCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY*) tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
        delete (CsrMatrix*) tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY*) temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
        delete (CsrMatrix*) temp;
      }
#endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /*
transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
    if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
    if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
    if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
    if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
    if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
    if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
    if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
    if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
      ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    if (!cusparsestruct->csr2csc_i) {
      /* Build (once) the CSR -> CSC entry permutation: run csr2csc on the sequence
         0,1,2,... stored as scalar "values"; the transposed values then hold, for each
         CSC entry, the index of its source CSR entry. */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      void   *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                           A->cmap->n, matrix->num_entries,
                                           matrix->values->data().get(),
                                           cusparsestruct->rowoffsets_gpu->data().get(),
                                           matrix->column_indices->data().get(),
                                           matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                           CUSPARSE_ACTION_NUMERIC,indexBase,
                                           cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
      err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err);
#endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                                A->cmap->n,matrix->num_entries,
                                csr2csc_a.data().get(),
                                cusparsestruct->rowoffsets_gpu->data().get(),
                                matrix->column_indices->data().get(),
                                matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                CUSPARSE_ACTION_NUMERIC,indexBase,
                                cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat);
#else
                                matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                                CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
#endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
      }

      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      err = cudaFree(csr2cscBuffer);CHKERRCUDA(err);
#endif
    }
    /* refresh the transposed values via the cached permutation (no csr2csc needed) */
    PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(),
cusparsestruct->csr2csc_i->begin()), 1436 thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), 1437 matrixT->values->begin())); 1438 } 1439 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1440 ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1441 /* the compressed row indices is not used for matTranspose */ 1442 matstructT->cprowIndices = NULL; 1443 /* assign the pointer */ 1444 ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 1445 A->transupdated = PETSC_TRUE; 1446 PetscFunctionReturn(0); 1447 } 1448 1449 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 1450 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1451 { 1452 PetscInt n = xx->map->n; 1453 const PetscScalar *barray; 1454 PetscScalar *xarray; 1455 thrust::device_ptr<const PetscScalar> bGPU; 1456 thrust::device_ptr<PetscScalar> xGPU; 1457 cusparseStatus_t stat; 1458 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1459 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1460 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1461 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1462 PetscErrorCode ierr; 1463 1464 PetscFunctionBegin; 1465 /* Analyze the matrix and create the transpose ... 
on the fly */ 1466 if (!loTriFactorT && !upTriFactorT) { 1467 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1468 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1469 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1470 } 1471 1472 /* Get the GPU pointers */ 1473 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1474 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1475 xGPU = thrust::device_pointer_cast(xarray); 1476 bGPU = thrust::device_pointer_cast(barray); 1477 1478 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1479 /* First, reorder with the row permutation */ 1480 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1481 thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 1482 xGPU); 1483 1484 /* First, solve U */ 1485 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1486 upTriFactorT->csrMat->num_rows, 1487 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1488 upTriFactorT->csrMat->num_entries, 1489 #endif 1490 &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1491 upTriFactorT->csrMat->values->data().get(), 1492 upTriFactorT->csrMat->row_offsets->data().get(), 1493 upTriFactorT->csrMat->column_indices->data().get(), 1494 upTriFactorT->solveInfo, 1495 xarray, 1496 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1497 tempGPU->data().get(), 1498 upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1499 #else 1500 tempGPU->data().get());CHKERRCUSPARSE(stat); 1501 #endif 1502 1503 /* Then, solve L */ 1504 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1505 loTriFactorT->csrMat->num_rows, 1506 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1507 loTriFactorT->csrMat->num_entries, 1508 #endif 1509 &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1510 loTriFactorT->csrMat->values->data().get(), 1511 
loTriFactorT->csrMat->row_offsets->data().get(), 1512 loTriFactorT->csrMat->column_indices->data().get(), 1513 loTriFactorT->solveInfo, 1514 tempGPU->data().get(), 1515 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1516 xarray, 1517 loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1518 #else 1519 xarray);CHKERRCUSPARSE(stat); 1520 #endif 1521 1522 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1523 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1524 thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1525 tempGPU->begin()); 1526 1527 /* Copy the temporary to the full solution. */ 1528 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU); 1529 1530 /* restore */ 1531 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1532 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1533 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1534 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1535 PetscFunctionReturn(0); 1536 } 1537 1538 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1539 { 1540 const PetscScalar *barray; 1541 PetscScalar *xarray; 1542 cusparseStatus_t stat; 1543 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1544 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1545 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1546 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1547 PetscErrorCode ierr; 1548 1549 PetscFunctionBegin; 1550 /* Analyze the matrix and create the transpose ... 
on the fly */ 1551 if (!loTriFactorT && !upTriFactorT) { 1552 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1553 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1554 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1555 } 1556 1557 /* Get the GPU pointers */ 1558 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1559 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1560 1561 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1562 /* First, solve U */ 1563 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1564 upTriFactorT->csrMat->num_rows, 1565 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1566 upTriFactorT->csrMat->num_entries, 1567 #endif 1568 &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1569 upTriFactorT->csrMat->values->data().get(), 1570 upTriFactorT->csrMat->row_offsets->data().get(), 1571 upTriFactorT->csrMat->column_indices->data().get(), 1572 upTriFactorT->solveInfo, 1573 barray, 1574 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1575 tempGPU->data().get(), 1576 upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1577 #else 1578 tempGPU->data().get());CHKERRCUSPARSE(stat); 1579 #endif 1580 1581 /* Then, solve L */ 1582 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1583 loTriFactorT->csrMat->num_rows, 1584 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1585 loTriFactorT->csrMat->num_entries, 1586 #endif 1587 &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1588 loTriFactorT->csrMat->values->data().get(), 1589 loTriFactorT->csrMat->row_offsets->data().get(), 1590 loTriFactorT->csrMat->column_indices->data().get(), 1591 loTriFactorT->solveInfo, 1592 tempGPU->data().get(), 1593 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1594 xarray, 1595 loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1596 #else 1597 xarray);CHKERRCUSPARSE(stat); 1598 #endif 1599 1600 /* restore */ 1601 ierr = 
VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1602 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1603 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1604 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1605 PetscFunctionReturn(0); 1606 } 1607 1608 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1609 { 1610 const PetscScalar *barray; 1611 PetscScalar *xarray; 1612 thrust::device_ptr<const PetscScalar> bGPU; 1613 thrust::device_ptr<PetscScalar> xGPU; 1614 cusparseStatus_t stat; 1615 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1616 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1617 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1618 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1619 PetscErrorCode ierr; 1620 1621 PetscFunctionBegin; 1622 1623 /* Get the GPU pointers */ 1624 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1625 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1626 xGPU = thrust::device_pointer_cast(xarray); 1627 bGPU = thrust::device_pointer_cast(barray); 1628 1629 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1630 /* First, reorder with the row permutation */ 1631 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1632 thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 1633 tempGPU->begin()); 1634 1635 /* Next, solve L */ 1636 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1637 loTriFactor->csrMat->num_rows, 1638 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1639 loTriFactor->csrMat->num_entries, 1640 #endif 1641 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1642 loTriFactor->csrMat->values->data().get(), 1643 
loTriFactor->csrMat->row_offsets->data().get(), 1644 loTriFactor->csrMat->column_indices->data().get(), 1645 loTriFactor->solveInfo, 1646 tempGPU->data().get(), 1647 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1648 xarray, 1649 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1650 #else 1651 xarray);CHKERRCUSPARSE(stat); 1652 #endif 1653 1654 /* Then, solve U */ 1655 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1656 upTriFactor->csrMat->num_rows, 1657 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1658 upTriFactor->csrMat->num_entries, 1659 #endif 1660 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1661 upTriFactor->csrMat->values->data().get(), 1662 upTriFactor->csrMat->row_offsets->data().get(), 1663 upTriFactor->csrMat->column_indices->data().get(), 1664 upTriFactor->solveInfo,xarray, 1665 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1666 tempGPU->data().get(), 1667 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1668 #else 1669 tempGPU->data().get());CHKERRCUSPARSE(stat); 1670 #endif 1671 1672 /* Last, reorder with the column permutation */ 1673 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 1674 thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), 1675 xGPU); 1676 1677 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1678 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1679 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1680 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1681 PetscFunctionReturn(0); 1682 } 1683 1684 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1685 { 1686 const PetscScalar *barray; 1687 PetscScalar *xarray; 1688 cusparseStatus_t stat; 1689 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1690 
Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1691 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1692 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1693 PetscErrorCode ierr; 1694 1695 PetscFunctionBegin; 1696 /* Get the GPU pointers */ 1697 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1698 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1699 1700 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1701 /* First, solve L */ 1702 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1703 loTriFactor->csrMat->num_rows, 1704 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1705 loTriFactor->csrMat->num_entries, 1706 #endif 1707 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1708 loTriFactor->csrMat->values->data().get(), 1709 loTriFactor->csrMat->row_offsets->data().get(), 1710 loTriFactor->csrMat->column_indices->data().get(), 1711 loTriFactor->solveInfo, 1712 barray, 1713 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1714 tempGPU->data().get(), 1715 loTriFactor->solvePolicy,loTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1716 #else 1717 tempGPU->data().get());CHKERRCUSPARSE(stat); 1718 #endif 1719 1720 /* Next, solve U */ 1721 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1722 upTriFactor->csrMat->num_rows, 1723 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1724 upTriFactor->csrMat->num_entries, 1725 #endif 1726 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1727 upTriFactor->csrMat->values->data().get(), 1728 upTriFactor->csrMat->row_offsets->data().get(), 1729 upTriFactor->csrMat->column_indices->data().get(), 1730 upTriFactor->solveInfo, 1731 tempGPU->data().get(), 1732 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1733 xarray, 1734 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1735 #else 1736 xarray);CHKERRCUSPARSE(stat); 1737 #endif 1738 1739 ierr = 
VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1740 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1741 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1742 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1743 PetscFunctionReturn(0); 1744 } 1745 1746 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) 1747 { 1748 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1749 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1750 cudaError_t cerr; 1751 PetscErrorCode ierr; 1752 1753 PetscFunctionBegin; 1754 if (A->offloadmask == PETSC_OFFLOAD_GPU) { 1755 CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat; 1756 1757 ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 1758 cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 1759 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1760 ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr); 1761 ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 1762 A->offloadmask = PETSC_OFFLOAD_BOTH; 1763 } 1764 PetscFunctionReturn(0); 1765 } 1766 1767 static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 1768 { 1769 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1770 PetscErrorCode ierr; 1771 1772 PetscFunctionBegin; 1773 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 1774 *array = a->a; 1775 A->offloadmask = PETSC_OFFLOAD_CPU; 1776 PetscFunctionReturn(0); 1777 } 1778 1779 PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 1780 { 1781 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1782 Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 1783 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1784 PetscInt m = A->rmap->n,*ii,*ridx,tmp; 1785 PetscErrorCode ierr; 1786 cusparseStatus_t stat; 1787 PetscBool both = PETSC_TRUE; 1788 cudaError_t err; 1789 1790 PetscFunctionBegin; 1791 if (A->boundtocpu) 
SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU"); 1792 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 1793 if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */ 1794 CsrMatrix *matrix; 1795 matrix = (CsrMatrix*)cusparsestruct->mat->mat; 1796 1797 if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values"); 1798 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1799 matrix->values->assign(a->a, a->a+a->nz); 1800 err = WaitForCUDA();CHKERRCUDA(err); 1801 ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 1802 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1803 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); 1804 } else { 1805 PetscInt nnz; 1806 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1807 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr); 1808 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 1809 delete cusparsestruct->workVector; 1810 delete cusparsestruct->rowoffsets_gpu; 1811 cusparsestruct->workVector = NULL; 1812 cusparsestruct->rowoffsets_gpu = NULL; 1813 try { 1814 if (a->compressedrow.use) { 1815 m = a->compressedrow.nrows; 1816 ii = a->compressedrow.i; 1817 ridx = a->compressedrow.rindex; 1818 } else { 1819 m = A->rmap->n; 1820 ii = a->i; 1821 ridx = NULL; 1822 } 1823 if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data"); 1824 if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data"); 1825 if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; } 1826 else nnz = a->nz; 1827 1828 /* create cusparse matrix */ 1829 cusparsestruct->nrows = m; 1830 matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 1831 stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat); 1832 stat = 
cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 1833 stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1834 1835 err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 1836 err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1837 err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1838 err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1839 err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1840 err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1841 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 1842 1843 /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1844 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1845 /* set the matrix */ 1846 CsrMatrix *mat= new CsrMatrix; 1847 mat->num_rows = m; 1848 mat->num_cols = A->cmap->n; 1849 mat->num_entries = nnz; 1850 mat->row_offsets = new THRUSTINTARRAY32(m+1); 1851 mat->row_offsets->assign(ii, ii + m+1); 1852 1853 mat->column_indices = new THRUSTINTARRAY32(nnz); 1854 mat->column_indices->assign(a->j, a->j+nnz); 1855 1856 mat->values = new THRUSTARRAY(nnz); 1857 if (a->a) mat->values->assign(a->a, a->a+nnz); 1858 1859 /* assign the pointer */ 1860 matstruct->mat = mat; 1861 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1862 if (mat->num_rows) { /* cusparse errors on empty matrices! 
*/ 1863 stat = cusparseCreateCsr(&matstruct->matDescr, 1864 mat->num_rows, mat->num_cols, mat->num_entries, 1865 mat->row_offsets->data().get(), mat->column_indices->data().get(), 1866 mat->values->data().get(), 1867 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 1868 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 1869 } 1870 #endif 1871 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1872 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1873 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1874 #else 1875 CsrMatrix *mat= new CsrMatrix; 1876 mat->num_rows = m; 1877 mat->num_cols = A->cmap->n; 1878 mat->num_entries = nnz; 1879 mat->row_offsets = new THRUSTINTARRAY32(m+1); 1880 mat->row_offsets->assign(ii, ii + m+1); 1881 1882 mat->column_indices = new THRUSTINTARRAY32(nnz); 1883 mat->column_indices->assign(a->j, a->j+nnz); 1884 1885 mat->values = new THRUSTARRAY(nnz); 1886 if (a->a) mat->values->assign(a->a, a->a+nnz); 1887 1888 cusparseHybMat_t hybMat; 1889 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1890 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 
1891 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1892 stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, 1893 matstruct->descr, mat->values->data().get(), 1894 mat->row_offsets->data().get(), 1895 mat->column_indices->data().get(), 1896 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1897 /* assign the pointer */ 1898 matstruct->mat = hybMat; 1899 1900 if (mat) { 1901 if (mat->values) delete (THRUSTARRAY*)mat->values; 1902 if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices; 1903 if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets; 1904 delete (CsrMatrix*)mat; 1905 } 1906 #endif 1907 } 1908 1909 /* assign the compressed row indices */ 1910 if (a->compressedrow.use) { 1911 cusparsestruct->workVector = new THRUSTARRAY(m); 1912 matstruct->cprowIndices = new THRUSTINTARRAY(m); 1913 matstruct->cprowIndices->assign(ridx,ridx+m); 1914 tmp = m; 1915 } else { 1916 cusparsestruct->workVector = NULL; 1917 matstruct->cprowIndices = NULL; 1918 tmp = 0; 1919 } 1920 ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr); 1921 1922 /* assign the pointer */ 1923 cusparsestruct->mat = matstruct; 1924 } catch(char *ex) { 1925 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1926 } 1927 err = WaitForCUDA();CHKERRCUDA(err); 1928 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1929 cusparsestruct->nonzerostate = A->nonzerostate; 1930 } 1931 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 1932 } 1933 PetscFunctionReturn(0); 1934 } 1935 1936 struct VecCUDAPlusEquals 1937 { 1938 template <typename Tuple> 1939 __host__ __device__ 1940 void operator()(Tuple t) 1941 { 1942 thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 1943 } 1944 }; 1945 1946 struct VecCUDAEquals 1947 { 1948 template <typename Tuple> 1949 __host__ __device__ 1950 void operator()(Tuple t) 1951 { 1952 thrust::get<1>(t) = thrust::get<0>(t); 1953 } 
};

/* Thrust functor: copies in the reverse direction of VecCUDAEquals (first = second) */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};

/* Product data carried between the symbolic and numeric phases of AIJCUSPARSE mat-mat products */
struct MatMatCusparse {
  PetscBool      cisdense;
  PetscScalar    *Bt;
  Mat            X;
  PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;
  CsrMatrix      *Bcsr;

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t matSpBDescr;
  PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;
  cusparseDnMatDescr_t matCDescr;
  PetscInt             Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  void *dBuffer4;
  void *dBuffer5;
#endif
  size_t               mmBufferSize;
  void                 *mmBuffer;
  void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};

/* Free all device buffers, cusparse descriptors, and the auxiliary dense matrix held by a MatMatCusparse */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  PetscErrorCode ierr;
  MatMatCusparse *mmdata = (MatMatCusparse *)data;
  cudaError_t    cerr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
#endif

  PetscFunctionBegin;
  cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr); /* cudaFree(NULL) is a no-op */
  delete mmdata->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->matBDescr)   { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->matCDescr)   { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->spgemmDesc)  { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); }
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  if (mmdata->dBuffer4)  { cerr = cudaFree(mmdata->dBuffer4);CHKERRCUDA(cerr); }
  if (mmdata->dBuffer5)  { cerr = cudaFree(mmdata->dBuffer5);CHKERRCUDA(cerr); }
#endif
  if (mmdata->mmBuffer)  { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); }
  if (mmdata->mmBuffer2) { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); }
#endif
  ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);

/* Numeric phase of C = op(A) * op(B) products with sparse A (AIJCUSPARSE) and dense B */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2056 m = A->rmap->n; 2057 n = B->cmap->n; 2058 break; 2059 case MATPRODUCT_AtB: 2060 if (!A->form_explicit_transpose) { 2061 mat = cusp->mat; 2062 opA = CUSPARSE_OPERATION_TRANSPOSE; 2063 } else { 2064 ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr); 2065 mat = cusp->matTranspose; 2066 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2067 } 2068 m = A->cmap->n; 2069 n = B->cmap->n; 2070 break; 2071 case MATPRODUCT_ABt: 2072 case MATPRODUCT_RARt: 2073 mat = cusp->mat; 2074 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2075 m = A->rmap->n; 2076 n = B->rmap->n; 2077 break; 2078 default: 2079 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2080 } 2081 if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 2082 csrmat = (CsrMatrix*)mat->mat; 2083 /* if the user passed a CPU matrix, copy the data to the GPU */ 2084 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr); 2085 if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);} 2086 ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr); 2087 2088 ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr); 2089 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2090 ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2091 ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr); 2092 } else { 2093 ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr); 2094 ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr); 2095 } 2096 2097 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2098 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2099 cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? 
CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 2100 /* (re)allocate mmBuffer if not initialized or LDAs are different */ 2101 if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 2102 size_t mmBufferSize; 2103 if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;} 2104 if (!mmdata->matBDescr) { 2105 stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2106 mmdata->Blda = blda; 2107 } 2108 2109 if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;} 2110 if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 2111 stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2112 mmdata->Clda = clda; 2113 } 2114 2115 if (!mat->matDescr) { 2116 stat = cusparseCreateCsr(&mat->matDescr, 2117 csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, 2118 csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), 2119 csrmat->values->data().get(), 2120 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2121 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 2122 } 2123 stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one, 2124 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2125 mmdata->matCDescr,cusparse_scalartype, 2126 cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat); 2127 if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) { 2128 cudaError_t cerr; 2129 cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); 2130 cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr); 2131 mmdata->mmBufferSize = mmBufferSize; 2132 } 2133 mmdata->initialized = PETSC_TRUE; 
2134 } else { 2135 /* to be safe, always update pointers of the mats */ 2136 stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat); 2137 stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat); 2138 stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat); 2139 } 2140 2141 /* do cusparseSpMM, which supports transpose on B */ 2142 stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one, 2143 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2144 mmdata->matCDescr,cusparse_scalartype, 2145 cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2146 #else 2147 PetscInt k; 2148 /* cusparseXcsrmm does not support transpose on B */ 2149 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2150 cublasHandle_t cublasv2handle; 2151 cublasStatus_t cerr; 2152 2153 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 2154 cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T, 2155 B->cmap->n,B->rmap->n, 2156 &PETSC_CUSPARSE_ONE ,barray,blda, 2157 &PETSC_CUSPARSE_ZERO,barray,blda, 2158 mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr); 2159 blda = B->cmap->n; 2160 k = B->cmap->n; 2161 } else { 2162 k = B->rmap->n; 2163 } 2164 2165 /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 2166 stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k, 2167 csrmat->num_entries,mat->alpha_one,mat->descr, 2168 csrmat->values->data().get(), 2169 csrmat->row_offsets->data().get(), 2170 csrmat->column_indices->data().get(), 2171 mmdata->Bt ? 
                             mmdata->Bt : barray,blda,mat->beta_zero,
                             carray,clda);CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  /* for RARt/PtAP the sparse-dense product was written to the intermediate mmdata->X;
     finish with a dense-dense product against B to obtain C */
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* convert back to host dense type where the caller's matrices were not CUDA types
     (cisdense/biscuda flags; presumably set when the inputs were MATSEQDENSE) */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Symbolic phase for products C = op(A)*op(B) with A of type SEQAIJCUSPARSE and B dense:
   sets the sizes and type of C from the requested product type, allocates the
   MatMatCusparse product data (including the B^T work buffer for CUDA < 11 and the
   intermediate matrix X for RARt/PtAP), and registers the numeric-phase callback.
   Only the MAT_CUSPARSE_CSR storage format is supported. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  /* result sizes depend on the requested product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data (freed by MatDestroy_MatMatCusparse registered below in the file) */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense;
#if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}

/* Numeric phase for sparse-sparse products C = op(A)*op(B) with both operands of type
   SEQAIJCUSPARSE: recomputes the values of C on the GPU with cuSPARSE spgemm, reusing
   the structures, descriptors and buffers stored in the MatMatCusparse product data by
   the symbolic phase. Only the MAT_CUSPARSE_CSR storage format is supported. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
    goto finalize; /* skip the spgemm; only the assembly bookkeeping below is needed */
  }
  if (!c->nz) goto finalize; /* empty C: nothing to compute */
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);

  /* mirror the symbolic phase: AtB/ABt were rewritten as AB when A/B is symmetric */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    if (!product->symbolic_used_the_fact_A_is_symmetric) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    if (!product->symbolic_used_the_fact_B_is_symmetric) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric");
  }
  /* since spgemm does not take transpose operations, transposed products use the
     explicitly stored transpose (matTranspose) of the operand */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  /* CUDA >= 11.4: values-only recomputation, reusing the structure/buffers from the symbolic phase */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                                     Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                     cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                     mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  /* CUDA 11.0-11.3: compute then copy the result into C's CSR arrays */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#endif
#else
  /* pre-CUDA-11 legacy csrgemm path */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
  ierr = PetscInfo(C,"Number of mallocs during
MatSetValues() is 0\n");CHKERRQ(ierr); 2390 ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr); 2391 c->reallocs = 0; 2392 C->info.mallocs += 0; 2393 C->info.nz_unneeded = 0; 2394 C->assembled = C->was_assembled = PETSC_TRUE; 2395 C->num_ass++; 2396 PetscFunctionReturn(0); 2397 } 2398 2399 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2400 { 2401 Mat_Product *product = C->product; 2402 Mat A,B; 2403 Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2404 Mat_SeqAIJ *a,*b,*c; 2405 Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2406 CsrMatrix *Acsr,*Bcsr,*Ccsr; 2407 PetscInt i,j,m,n,k; 2408 PetscBool flg; 2409 PetscErrorCode ierr; 2410 cusparseStatus_t stat; 2411 cudaError_t cerr; 2412 MatProductType ptype; 2413 MatMatCusparse *mmdata; 2414 PetscLogDouble flops; 2415 PetscBool biscompressed,ciscompressed; 2416 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2417 int64_t C_num_rows1, C_num_cols1, C_nnz1; 2418 cusparseSpMatDescr_t BmatSpDescr; 2419 #else 2420 int cnz; 2421 #endif 2422 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 2423 2424 PetscFunctionBegin; 2425 MatCheckProduct(C,1); 2426 if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty"); 2427 A = product->A; 2428 B = product->B; 2429 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2430 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2431 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2432 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name); 2433 a = (Mat_SeqAIJ*)A->data; 2434 b = (Mat_SeqAIJ*)B->data; 2435 /* product data */ 2436 ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2437 C->product->data = mmdata; 2438 
C->product->destroy = MatDestroy_MatMatCusparse; 2439 2440 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2441 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 2442 Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */ 2443 Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2444 if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2445 if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2446 2447 ptype = product->type; 2448 if (A->symmetric && ptype == MATPRODUCT_AtB) { 2449 ptype = MATPRODUCT_AB; 2450 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 2451 } 2452 if (B->symmetric && ptype == MATPRODUCT_ABt) { 2453 ptype = MATPRODUCT_AB; 2454 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 2455 } 2456 biscompressed = PETSC_FALSE; 2457 ciscompressed = PETSC_FALSE; 2458 switch (ptype) { 2459 case MATPRODUCT_AB: 2460 m = A->rmap->n; 2461 n = B->cmap->n; 2462 k = A->cmap->n; 2463 Amat = Acusp->mat; 2464 Bmat = Bcusp->mat; 2465 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2466 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2467 break; 2468 case MATPRODUCT_AtB: 2469 m = A->cmap->n; 2470 n = B->cmap->n; 2471 k = A->rmap->n; 2472 ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr); 2473 Amat = Acusp->matTranspose; 2474 Bmat = Bcusp->mat; 2475 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2476 break; 2477 case MATPRODUCT_ABt: 2478 m = A->rmap->n; 2479 n = B->rmap->n; 2480 k = A->cmap->n; 2481 ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr); 2482 Amat = Acusp->mat; 2483 Bmat = Bcusp->matTranspose; 2484 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2485 break; 2486 default: 2487 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2488 
} 2489 2490 /* create cusparse matrix */ 2491 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2492 ierr = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 2493 c = (Mat_SeqAIJ*)C->data; 2494 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2495 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 2496 Ccsr = new CsrMatrix; 2497 2498 c->compressedrow.use = ciscompressed; 2499 if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */ 2500 c->compressedrow.nrows = a->compressedrow.nrows; 2501 ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr); 2502 ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr); 2503 Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 2504 Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 2505 Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows); 2506 } else { 2507 c->compressedrow.nrows = 0; 2508 c->compressedrow.i = NULL; 2509 c->compressedrow.rindex = NULL; 2510 Ccusp->workVector = NULL; 2511 Cmat->cprowIndices = NULL; 2512 } 2513 Ccusp->nrows = ciscompressed ? 
c->compressedrow.nrows : m; 2514 Ccusp->mat = Cmat; 2515 Ccusp->mat->mat = Ccsr; 2516 Ccsr->num_rows = Ccusp->nrows; 2517 Ccsr->num_cols = n; 2518 Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1); 2519 stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat); 2520 stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 2521 stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 2522 cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 2523 cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 2524 cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 2525 cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2526 cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2527 cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2528 if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! 
*/ 2529 thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0); 2530 c->nz = 0; 2531 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2532 Ccsr->values = new THRUSTARRAY(c->nz); 2533 goto finalizesym; 2534 } 2535 2536 if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2537 if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2538 Acsr = (CsrMatrix*)Amat->mat; 2539 if (!biscompressed) { 2540 Bcsr = (CsrMatrix*)Bmat->mat; 2541 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2542 BmatSpDescr = Bmat->matDescr; 2543 #endif 2544 } else { /* we need to use row offsets for the full matrix */ 2545 CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat; 2546 Bcsr = new CsrMatrix; 2547 Bcsr->num_rows = B->rmap->n; 2548 Bcsr->num_cols = cBcsr->num_cols; 2549 Bcsr->num_entries = cBcsr->num_entries; 2550 Bcsr->column_indices = cBcsr->column_indices; 2551 Bcsr->values = cBcsr->values; 2552 if (!Bcusp->rowoffsets_gpu) { 2553 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 2554 Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 2555 ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 2556 } 2557 Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 2558 mmdata->Bcsr = Bcsr; 2559 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2560 if (Bcsr->num_rows && Bcsr->num_cols) { 2561 stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, 2562 Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2563 Bcsr->values->data().get(), 2564 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2565 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2566 } 2567 BmatSpDescr = mmdata->matSpBDescr; 2568 #endif 2569 } 2570 if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct"); 2571 if (!Bcsr) 
SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct"); 2572 /* precompute flops count */ 2573 if (ptype == MATPRODUCT_AB) { 2574 for (i=0, flops = 0; i<A->rmap->n; i++) { 2575 const PetscInt st = a->i[i]; 2576 const PetscInt en = a->i[i+1]; 2577 for (j=st; j<en; j++) { 2578 const PetscInt brow = a->j[j]; 2579 flops += 2.*(b->i[brow+1] - b->i[brow]); 2580 } 2581 } 2582 } else if (ptype == MATPRODUCT_AtB) { 2583 for (i=0, flops = 0; i<A->rmap->n; i++) { 2584 const PetscInt anzi = a->i[i+1] - a->i[i]; 2585 const PetscInt bnzi = b->i[i+1] - b->i[i]; 2586 flops += (2.*anzi)*bnzi; 2587 } 2588 } else { /* TODO */ 2589 flops = 0.; 2590 } 2591 2592 mmdata->flops = flops; 2593 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2594 2595 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2596 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2597 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, 2598 NULL, NULL, NULL, 2599 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2600 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2601 stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2602 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2603 { 2604 /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it. 
2605 We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse 2606 */ 2607 void* dBuffer1 = NULL; 2608 void* dBuffer2 = NULL; 2609 void* dBuffer3 = NULL; 2610 /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */ 2611 size_t bufferSize1 = 0; 2612 size_t bufferSize2 = 0; 2613 size_t bufferSize3 = 0; 2614 size_t bufferSize4 = 0; 2615 size_t bufferSize5 = 0; 2616 2617 /*----------------------------------------------------------------------*/ 2618 /* ask bufferSize1 bytes for external memory */ 2619 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2620 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2621 &bufferSize1, NULL);CHKERRCUSPARSE(stat); 2622 cerr = cudaMalloc((void**) &dBuffer1, bufferSize1);CHKERRCUDA(cerr); 2623 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2624 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2625 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2626 &bufferSize1, dBuffer1);CHKERRCUSPARSE(stat); 2627 2628 /*----------------------------------------------------------------------*/ 2629 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2630 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2631 &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);CHKERRCUSPARSE(stat); 2632 cerr = cudaMalloc((void**) &dBuffer2, bufferSize2);CHKERRCUDA(cerr); 2633 cerr = cudaMalloc((void**) &dBuffer3, bufferSize3);CHKERRCUDA(cerr); 2634 cerr = cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4);CHKERRCUDA(cerr); 2635 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2636 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2637 &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, 
mmdata->dBuffer4);CHKERRCUSPARSE(stat); 2638 cerr = cudaFree(dBuffer1);CHKERRCUDA(cerr); 2639 cerr = cudaFree(dBuffer2);CHKERRCUDA(cerr); 2640 2641 /*----------------------------------------------------------------------*/ 2642 /* get matrix C non-zero entries C_nnz1 */ 2643 stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat); 2644 c->nz = (PetscInt) C_nnz1; 2645 /* allocate matrix C */ 2646 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2647 Ccsr->values = new THRUSTARRAY(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2648 /* update matC with the new pointers */ 2649 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2650 Ccsr->values->data().get());CHKERRCUSPARSE(stat); 2651 2652 /*----------------------------------------------------------------------*/ 2653 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2654 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2655 &bufferSize5, NULL);CHKERRCUSPARSE(stat); 2656 cerr = cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5);CHKERRCUDA(cerr); 2657 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2658 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2659 &bufferSize5, mmdata->dBuffer5);CHKERRCUSPARSE(stat); 2660 cerr = cudaFree(dBuffer3);CHKERRCUDA(cerr); 2661 stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, 2662 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2663 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2664 mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2665 ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024);CHKERRQ(ierr); 2666 } 
2667 #else // ~PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2668 size_t bufSize2; 2669 /* ask bufferSize bytes for external memory */ 2670 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 2671 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2672 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2673 mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat); 2674 cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr); 2675 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2676 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 2677 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2678 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2679 mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat); 2680 /* ask bufferSize again bytes for external memory */ 2681 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2682 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2683 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2684 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat); 2685 /* The CUSPARSE documentation is not clear, nor the API 2686 We need both buffers to perform the operations properly! 2687 mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 2688 it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 2689 is stored in the descriptor! What a messy API... 
*/ 2690 cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr); 2691 /* compute the intermediate product of A * B */ 2692 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2693 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2694 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2695 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2696 /* get matrix C non-zero entries C_nnz1 */ 2697 stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat); 2698 c->nz = (PetscInt) C_nnz1; 2699 ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr); 2700 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2701 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2702 Ccsr->values = new THRUSTARRAY(c->nz); 2703 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2704 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2705 Ccsr->values->data().get());CHKERRCUSPARSE(stat); 2706 stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, 2707 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2708 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2709 #endif 2710 #else 2711 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 2712 stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, 2713 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2714 Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2715 Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2716 Cmat->descr, Ccsr->row_offsets->data().get(), 
&cnz);CHKERRCUSPARSE(stat); 2717 c->nz = cnz; 2718 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2719 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2720 Ccsr->values = new THRUSTARRAY(c->nz); 2721 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2722 2723 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2724 /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 2725 I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 2726 D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */ 2727 stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, 2728 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2729 Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2730 Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2731 Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2732 #endif 2733 ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2734 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2735 finalizesym: 2736 c->singlemalloc = PETSC_FALSE; 2737 c->free_a = PETSC_TRUE; 2738 c->free_ij = PETSC_TRUE; 2739 ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 2740 ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 2741 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 2742 PetscInt *d_i = c->i; 2743 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 2744 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 2745 ii = *Ccsr->row_offsets; 2746 jj = *Ccsr->column_indices; 2747 if (ciscompressed) d_i = c->compressedrow.i; 2748 cerr = 
cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2749 cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2750 } else { 2751 PetscInt *d_i = c->i; 2752 if (ciscompressed) d_i = c->compressedrow.i; 2753 cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2754 cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2755 } 2756 if (ciscompressed) { /* need to expand host row offsets */ 2757 PetscInt r = 0; 2758 c->i[0] = 0; 2759 for (k = 0; k < c->compressedrow.nrows; k++) { 2760 const PetscInt next = c->compressedrow.rindex[k]; 2761 const PetscInt old = c->compressedrow.i[k]; 2762 for (; r < next; r++) c->i[r+1] = old; 2763 } 2764 for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows]; 2765 } 2766 ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 2767 ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 2768 ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 2769 c->maxnz = c->nz; 2770 c->nonzerorowcnt = 0; 2771 c->rmax = 0; 2772 for (k = 0; k < m; k++) { 2773 const PetscInt nn = c->i[k+1] - c->i[k]; 2774 c->ilen[k] = c->imax[k] = nn; 2775 c->nonzerorowcnt += (PetscInt)!!nn; 2776 c->rmax = PetscMax(c->rmax,nn); 2777 } 2778 ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr); 2779 ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 2780 Ccsr->num_entries = c->nz; 2781 2782 C->nonzerostate++; 2783 ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr); 2784 ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr); 2785 Ccusp->nonzerostate = C->nonzerostate; 2786 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 2787 C->preallocated = PETSC_TRUE; 2788 C->assembled = PETSC_FALSE; 2789 C->was_assembled = PETSC_FALSE; 
2790 if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */ 2791 mmdata->reusesym = PETSC_TRUE; 2792 C->offloadmask = PETSC_OFFLOAD_GPU; 2793 } 2794 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2795 PetscFunctionReturn(0); 2796 } 2797 2798 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 2799 2800 /* handles sparse or dense B */ 2801 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat) 2802 { 2803 Mat_Product *product = mat->product; 2804 PetscErrorCode ierr; 2805 PetscBool isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE; 2806 2807 PetscFunctionBegin; 2808 MatCheckProduct(mat,1); 2809 ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr); 2810 if (!product->A->boundtocpu && !product->B->boundtocpu) { 2811 ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr); 2812 } 2813 if (product->type == MATPRODUCT_ABC) { 2814 Ciscusp = PETSC_FALSE; 2815 if (!product->C->boundtocpu) { 2816 ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr); 2817 } 2818 } 2819 if (Biscusp && Ciscusp) { /* we can always select the CPU backend */ 2820 PetscBool usecpu = PETSC_FALSE; 2821 switch (product->type) { 2822 case MATPRODUCT_AB: 2823 if (product->api_user) { 2824 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");CHKERRQ(ierr); 2825 ierr = PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2826 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2827 } else { 2828 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr); 2829 ierr = 
PetscOptionsBool("-matproduct_ab_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2830 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2831 } 2832 break; 2833 case MATPRODUCT_AtB: 2834 if (product->api_user) { 2835 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr); 2836 ierr = PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2837 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2838 } else { 2839 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr); 2840 ierr = PetscOptionsBool("-matproduct_atb_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2841 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2842 } 2843 break; 2844 case MATPRODUCT_PtAP: 2845 if (product->api_user) { 2846 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");CHKERRQ(ierr); 2847 ierr = PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2848 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2849 } else { 2850 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr); 2851 ierr = PetscOptionsBool("-matproduct_ptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2852 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2853 } 2854 break; 2855 case MATPRODUCT_RARt: 2856 if (product->api_user) { 2857 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");CHKERRQ(ierr); 2858 ierr = PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2859 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2860 } else { 2861 ierr = 
PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");CHKERRQ(ierr); 2862 ierr = PetscOptionsBool("-matproduct_rart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2863 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2864 } 2865 break; 2866 case MATPRODUCT_ABC: 2867 if (product->api_user) { 2868 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");CHKERRQ(ierr); 2869 ierr = PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2870 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2871 } else { 2872 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");CHKERRQ(ierr); 2873 ierr = PetscOptionsBool("-matproduct_abc_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2874 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2875 } 2876 break; 2877 default: 2878 break; 2879 } 2880 if (usecpu) Biscusp = Ciscusp = PETSC_FALSE; 2881 } 2882 /* dispatch */ 2883 if (isdense) { 2884 switch (product->type) { 2885 case MATPRODUCT_AB: 2886 case MATPRODUCT_AtB: 2887 case MATPRODUCT_ABt: 2888 case MATPRODUCT_PtAP: 2889 case MATPRODUCT_RARt: 2890 if (product->A->boundtocpu) { 2891 ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr); 2892 } else { 2893 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 2894 } 2895 break; 2896 case MATPRODUCT_ABC: 2897 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2898 break; 2899 default: 2900 break; 2901 } 2902 } else if (Biscusp && Ciscusp) { 2903 switch (product->type) { 2904 case MATPRODUCT_AB: 2905 case MATPRODUCT_AtB: 2906 case MATPRODUCT_ABt: 2907 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2908 break; 2909 case MATPRODUCT_PtAP: 2910 case MATPRODUCT_RARt: 2911 case MATPRODUCT_ABC: 2912 mat->ops->productsymbolic = 
MatProductSymbolic_ABC_Basic; 2913 break; 2914 default: 2915 break; 2916 } 2917 } else { /* fallback for AIJ */ 2918 ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 2919 } 2920 PetscFunctionReturn(0); 2921 } 2922 2923 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2924 { 2925 PetscErrorCode ierr; 2926 2927 PetscFunctionBegin; 2928 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2929 PetscFunctionReturn(0); 2930 } 2931 2932 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz) 2933 { 2934 PetscErrorCode ierr; 2935 2936 PetscFunctionBegin; 2937 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2938 PetscFunctionReturn(0); 2939 } 2940 2941 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2942 { 2943 PetscErrorCode ierr; 2944 2945 PetscFunctionBegin; 2946 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 2947 PetscFunctionReturn(0); 2948 } 2949 2950 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2951 { 2952 PetscErrorCode ierr; 2953 2954 PetscFunctionBegin; 2955 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 2956 PetscFunctionReturn(0); 2957 } 2958 2959 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2960 { 2961 PetscErrorCode ierr; 2962 2963 PetscFunctionBegin; 2964 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2965 PetscFunctionReturn(0); 2966 } 2967 2968 __global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y) 2969 { 2970 int i = blockIdx.x*blockDim.x + threadIdx.x; 2971 if (i < n) y[idx[i]] += x[i]; 2972 } 2973 2974 /* z = op(A) x + y. 
   If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny;
#endif

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) {
    /* Empty matrix: the result is just y (or zero when there is no y) */
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      /* let cuSPARSE apply the (conjugate) transpose on the fly, reusing the non-transposed storage */
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      /* use (and lazily build) an explicitly stored transpose */
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
         allocated to accommodate different uses. So we get the length info directly from mat.
      */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
      */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA is used to index cuSpMV[]; guard against enum layout changes in future cuSPARSE releases */
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        cudaError_t cerr;
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                       matstruct->matDescr,
                                       matstruct->cuSpMV[opA].vecXDescr, beta,
                                       matstruct->cuSpMV[opA].vecYDescr,
                                       cusparse_scalartype,
                                       cusparsestruct->spmvAlg,
                                       &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                          matstruct->alpha_one,
                          matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                          matstruct->cuSpMV[opA].vecXDescr,
                          beta,
                          matstruct->cuSpMV[opA].vecYDescr,
                          cusparse_scalartype,
                          cusparsestruct->spmvAlg,
                          matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
#else
      /* pre-CUDA-11 legacy csrmv interface */
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
#endif
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
        */
#if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
#else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
#endif
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* zz = A^T*xx + yy */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Standard SeqAIJ assembly, plus: if the nonzero pattern changed, any cached device-side
   matrix representation is stale and is freed here so it will be rebuilt on next use. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode   ierr;
  PetscObjectState onnz = A->nonzerostate;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    cudaError_t cerr;

    ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr);
    cerr = cudaFree(cusp->deviceMat);CHKERRCUDA(cerr);
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz).  By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Frees GPU-side storage (factor or non-factor), removes composed methods, then
   falls through to the host SeqAIJ destructor. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = 
PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaijcusparse_hypre_C",NULL);CHKERRQ(ierr); 3272 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 3273 PetscFunctionReturn(0); 3274 } 3275 3276 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*); 3277 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool); 3278 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B) 3279 { 3280 PetscErrorCode ierr; 3281 3282 PetscFunctionBegin; 3283 ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 3284 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr); 3285 PetscFunctionReturn(0); 3286 } 3287 3288 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str) 3289 { 3290 PetscErrorCode ierr; 3291 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 3292 Mat_SeqAIJCUSPARSE *cy; 3293 Mat_SeqAIJCUSPARSE *cx; 3294 PetscScalar *ay; 3295 const PetscScalar *ax; 3296 CsrMatrix *csry,*csrx; 3297 3298 PetscFunctionBegin; 3299 cy = (Mat_SeqAIJCUSPARSE*)Y->spptr; 3300 cx = (Mat_SeqAIJCUSPARSE*)X->spptr; 3301 if (X->ops->axpy != Y->ops->axpy) { 3302 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr); 3303 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 3304 PetscFunctionReturn(0); 3305 } 3306 /* if we are here, it means both matrices are bound to GPU */ 3307 ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr); 3308 ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr); 3309 if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported"); 3310 if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported"); 3311 csry = (CsrMatrix*)cy->mat->mat; 3312 csrx = (CsrMatrix*)cx->mat->mat; 3313 /* see if we can turn this into a cublas axpy */ 3314 if (str != SAME_NONZERO_PATTERN && x->nz == y->nz 
&& !x->compressedrow.use && !y->compressedrow.use) { 3315 bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin()); 3316 if (eq) { 3317 eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin()); 3318 } 3319 if (eq) str = SAME_NONZERO_PATTERN; 3320 } 3321 /* spgeam is buggy with one column */ 3322 if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN; 3323 3324 if (str == SUBSET_NONZERO_PATTERN) { 3325 cusparseStatus_t stat; 3326 PetscScalar b = 1.0; 3327 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3328 size_t bufferSize; 3329 void *buffer; 3330 cudaError_t cerr; 3331 #endif 3332 3333 ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr); 3334 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3335 stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 3336 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3337 stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n, 3338 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3339 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3340 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat); 3341 cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr); 3342 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3343 stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3344 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3345 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3346 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat); 3347 ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr); 3348 ierr = 
PetscLogGpuTimeEnd();CHKERRQ(ierr); 3349 cerr = cudaFree(buffer);CHKERRCUDA(cerr); 3350 #else 3351 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3352 stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3353 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3354 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3355 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat); 3356 ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr); 3357 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3358 #endif 3359 stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 3360 ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr); 3361 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3362 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3363 } else if (str == SAME_NONZERO_PATTERN) { 3364 cublasHandle_t cublasv2handle; 3365 cublasStatus_t berr; 3366 PetscBLASInt one = 1, bnz = 1; 3367 3368 ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr); 3369 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3370 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 3371 ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr); 3372 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3373 berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr); 3374 ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr); 3375 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3376 ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr); 3377 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3378 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3379 } else { 3380 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr); 3381 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 3382 } 3383 PetscFunctionReturn(0); 3384 } 3385 3386 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat 
Y,PetscScalar a) 3387 { 3388 PetscErrorCode ierr; 3389 Mat_SeqAIJ *y = (Mat_SeqAIJ*)Y->data; 3390 PetscScalar *ay; 3391 cublasHandle_t cublasv2handle; 3392 cublasStatus_t berr; 3393 PetscBLASInt one = 1, bnz = 1; 3394 3395 PetscFunctionBegin; 3396 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3397 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 3398 ierr = PetscBLASIntCast(y->nz,&bnz);CHKERRQ(ierr); 3399 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3400 berr = cublasXscal(cublasv2handle,bnz,&a,ay,one);CHKERRCUBLAS(berr); 3401 ierr = PetscLogGpuFlops(bnz);CHKERRQ(ierr); 3402 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3403 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3404 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3405 PetscFunctionReturn(0); 3406 } 3407 3408 static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) 3409 { 3410 PetscErrorCode ierr; 3411 PetscBool both = PETSC_FALSE; 3412 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3413 3414 PetscFunctionBegin; 3415 if (A->factortype == MAT_FACTOR_NONE) { 3416 Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr; 3417 if (spptr->mat) { 3418 CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat; 3419 if (matrix->values) { 3420 both = PETSC_TRUE; 3421 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3422 } 3423 } 3424 if (spptr->matTranspose) { 3425 CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat; 3426 if (matrix->values) { 3427 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3428 } 3429 } 3430 } 3431 //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 3432 ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr); 3433 ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 3434 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 3435 else A->offloadmask = PETSC_OFFLOAD_CPU; 3436 PetscFunctionReturn(0); 3437 } 3438 3439 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg) 3440 { 3441 Mat_SeqAIJ *a = 
(Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* factored matrices keep their (device) solve kernels regardless of the flag */
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  if (flg) {
    /* binding to CPU: make sure the host copy is up to date, then install host kernels
       and detach the device-only composed functions */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  } else {
    /* binding to GPU: install the CUSPARSE kernels and compose the device helpers */
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  a->inode.use  = flg; /* inode optimization only makes sense for the host kernels */
  PetscFunctionReturn(0);
}

/* Converts a MATSEQAIJ matrix into a MATSEQAIJCUSPARSE matrix, allocating the
   CUSPARSE handle/metadata (spptr) and installing the device operation table */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat;

  /* vectors created to match this matrix should live on the GPU too */
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if
(reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      /* regular matrix: create the CUSPARSE context used by the MatMult-type kernels */
      Mat_SeqAIJCUSPARSE *spptr;
      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      spptr->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
      spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
#else
      spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
#endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = spptr;
    } else {
      /* factored matrix: create the (smaller) triangular-factor context instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* install the device operation table (flg=PETSC_FALSE means "bound to GPU") */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
#if defined(PETSC_HAVE_HYPRE)
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}

/* Type constructor: a MATSEQAIJCUSPARSE matrix is a MATSEQAIJ converted in place */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);

/* Registers the CUSPARSE-based solver types (LU/Cholesky/ILU/ICC, plus the banded LU) */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}

/* Frees the Mat_SeqAIJCUSPARSE context: both mult structs, the thrust work arrays,
   and finally the cusparse handle */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->csr2csc_i;
    if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
    ierr =
PetscFree(*cusparsestruct);CHKERRQ(ierr); 3610 } 3611 PetscFunctionReturn(0); 3612 } 3613 3614 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 3615 { 3616 PetscFunctionBegin; 3617 if (*mat) { 3618 delete (*mat)->values; 3619 delete (*mat)->column_indices; 3620 delete (*mat)->row_offsets; 3621 delete *mat; 3622 *mat = 0; 3623 } 3624 PetscFunctionReturn(0); 3625 } 3626 3627 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 3628 { 3629 cusparseStatus_t stat; 3630 PetscErrorCode ierr; 3631 3632 PetscFunctionBegin; 3633 if (*trifactor) { 3634 if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); } 3635 if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); } 3636 ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 3637 if ((*trifactor)->solveBuffer) {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);} 3638 if ((*trifactor)->AA_h) {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);} 3639 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3640 if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);} 3641 #endif 3642 ierr = PetscFree(*trifactor);CHKERRQ(ierr); 3643 } 3644 PetscFunctionReturn(0); 3645 } 3646 3647 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 3648 { 3649 CsrMatrix *mat; 3650 cusparseStatus_t stat; 3651 cudaError_t err; 3652 3653 PetscFunctionBegin; 3654 if (*matstruct) { 3655 if ((*matstruct)->mat) { 3656 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 3657 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3658 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3659 #else 3660 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 3661 stat = 
cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat); 3662 #endif 3663 } else { 3664 mat = (CsrMatrix*)(*matstruct)->mat; 3665 CsrMatrix_Destroy(&mat); 3666 } 3667 } 3668 if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); } 3669 delete (*matstruct)->cprowIndices; 3670 if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); } 3671 if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); } 3672 if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); } 3673 3674 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3675 Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 3676 if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);} 3677 for (int i=0; i<3; i++) { 3678 if (mdata->cuSpMV[i].initialized) { 3679 err = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err); 3680 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat); 3681 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat); 3682 } 3683 } 3684 #endif 3685 delete *matstruct; 3686 *matstruct = NULL; 3687 } 3688 PetscFunctionReturn(0); 3689 } 3690 3691 PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors) 3692 { 3693 PetscErrorCode ierr; 3694 3695 PetscFunctionBegin; 3696 if (*trifactors) { 3697 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr); 3698 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr); 3699 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr); 3700 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr); 3701 delete (*trifactors)->rpermIndices; 3702 delete (*trifactors)->cpermIndices; 3703 delete (*trifactors)->workVector; 3704 (*trifactors)->rpermIndices = NULL; 3705 (*trifactors)->cpermIndices = NULL; 
3706 (*trifactors)->workVector = NULL; 3707 if ((*trifactors)->a_band_d) {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);} 3708 if ((*trifactors)->i_band_d) {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);} 3709 (*trifactors)->init_dev_prop = PETSC_FALSE; 3710 } 3711 PetscFunctionReturn(0); 3712 } 3713 3714 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 3715 { 3716 PetscErrorCode ierr; 3717 cusparseHandle_t handle; 3718 cusparseStatus_t stat; 3719 3720 PetscFunctionBegin; 3721 if (*trifactors) { 3722 ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr); 3723 if (handle = (*trifactors)->handle) { 3724 stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 3725 } 3726 ierr = PetscFree(*trifactors);CHKERRQ(ierr); 3727 } 3728 PetscFunctionReturn(0); 3729 } 3730 3731 struct IJCompare 3732 { 3733 __host__ __device__ 3734 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3735 { 3736 if (t1.get<0>() < t2.get<0>()) return true; 3737 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 3738 return false; 3739 } 3740 }; 3741 3742 struct IJEqual 3743 { 3744 __host__ __device__ 3745 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3746 { 3747 if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false; 3748 return true; 3749 } 3750 }; 3751 3752 struct IJDiff 3753 { 3754 __host__ __device__ 3755 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 3756 { 3757 return t1 == t2 ? 
0 : 1;
  }
};

/* Logical OR of the two 0/1 change markers from IJDiff */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return t1||t2;
  }
};

#include <thrust/iterator/discard_iterator.h>
/* Inserts/adds the COO values v[] (in the user's original ordering) into the device CSR
   value array, using the permutation (cooPerm) and duplicate map (cooPerm_a) built by
   MatSetPreallocationCOO_SeqAIJCUSPARSE() */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *cooPerm_v = NULL;
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *matrix;
  PetscErrorCode                        ierr;
  PetscInt                              n;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO preallocation info: just run a regular assembly */
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    /* host values: stage them into a temporary device array */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add them */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
         cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonzeros in d_v[].
         cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    }
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr =
PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 3841 /* shorter version of MatAssemblyEnd_SeqAIJ */ 3842 ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr); 3843 ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 3844 ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr); 3845 a->reallocs = 0; 3846 A->info.mallocs += 0; 3847 A->info.nz_unneeded = 0; 3848 A->assembled = A->was_assembled = PETSC_TRUE; 3849 A->num_ass++; 3850 PetscFunctionReturn(0); 3851 } 3852 3853 PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy) 3854 { 3855 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3856 PetscErrorCode ierr; 3857 3858 PetscFunctionBegin; 3859 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3860 if (!cusp) PetscFunctionReturn(0); 3861 if (destroy) { 3862 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr); 3863 delete cusp->csr2csc_i; 3864 cusp->csr2csc_i = NULL; 3865 } 3866 A->transupdated = PETSC_FALSE; 3867 PetscFunctionReturn(0); 3868 } 3869 3870 #include <thrust/binary_search.h> 3871 PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[]) 3872 { 3873 PetscErrorCode ierr; 3874 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3875 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3876 PetscInt cooPerm_n, nzr = 0; 3877 cudaError_t cerr; 3878 3879 PetscFunctionBegin; 3880 ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr); 3881 ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr); 3882 cooPerm_n = cusp->cooPerm ? 
cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    /* size changed: throw away the old permutation arrays */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm   = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    *cusp->cooPerm_a = d_i; /* copy the sorted array */
    THRUSTINTARRAY w = d_j;

    /*
      d_i = [1,1,3,3,4,4]
      d_j = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i = [1,3,3,4,4,x]
                        ^ekey
      d_j = [2,2,3,5,6,x]
                       ^nekey
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());                                             /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());                  /* cooPerm_a = [0,0,1,1,1,1]*/
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());      /*cooPerm_a=[0,0,1,2,3,4]*/
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,          /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin());                                      /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* rebuild the host CSR arrays from the device results */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);

  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices.

   Not collective

   Input Parameters:
+   A - the matrix
-   compressed - PETSC_TRUE or PETSC_FALSE indicating whether the matrix data structure should always be returned in compressed form

   Output Parameters:
+   ia - the CSR row pointers
-   ja - the CSR column indices

   Level: developer

   Notes:
     When compressed is true, the CSR structure does not contain empty rows

.seealso: MatSeqAIJCUSPARSERestoreIJ(), MatSeqAIJCUSPARSEGetArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  if (!i || !j) PetscFunctionReturn(0);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) {
        /* lazily upload the full (uncompressed) host row offsets */
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
        ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ()

   Not collective

   Input Parameters:
+   A - the matrix
-   compressed - PETSC_TRUE or PETSC_FALSE indicating whether the matrix data structure should always be returned in compressed form

   Output Parameters:
+   ia - the CSR row pointers
-   ja - the CSR column indices

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetIJ()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (i) *i = NULL;
  if (j) *j = NULL;
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: may trigger host-device copies if up-to-date matrix data is on host

.seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead()

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* read-only access: no state increase needed */
  *a = NULL;
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: may trigger host-device copies if up-to-date matrix data is on host

.seealso: MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArray()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  /* caller may modify the device values: device becomes the valid copy and the cached
     transpose is stale */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/*@C
   MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray()

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetArray()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL;
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.
a - pointer to the device data 4201 4202 Level: developer 4203 4204 Notes: does not trigger host-device copies and flags data validity on the GPU 4205 4206 .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSERestoreArrayWrite() 4207 @*/ 4208 PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a) 4209 { 4210 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4211 CsrMatrix *csr; 4212 PetscErrorCode ierr; 4213 4214 PetscFunctionBegin; 4215 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4216 PetscValidPointer(a,2); 4217 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4218 if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4219 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4220 csr = (CsrMatrix*)cusp->mat->mat; 4221 if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 4222 *a = csr->values->data().get(); 4223 A->offloadmask = PETSC_OFFLOAD_GPU; 4224 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); 4225 PetscFunctionReturn(0); 4226 } 4227 4228 /*@C 4229 MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite() 4230 4231 Not Collective 4232 4233 Input Parameter: 4234 . A - a MATSEQAIJCUSPARSE matrix 4235 4236 Output Parameter: 4237 . 
a - pointer to the device data 4238 4239 Level: developer 4240 4241 .seealso: MatSeqAIJCUSPARSEGetArrayWrite() 4242 @*/ 4243 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a) 4244 { 4245 PetscErrorCode ierr; 4246 4247 PetscFunctionBegin; 4248 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4249 PetscValidPointer(a,2); 4250 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4251 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 4252 *a = NULL; 4253 PetscFunctionReturn(0); 4254 } 4255 4256 struct IJCompare4 4257 { 4258 __host__ __device__ 4259 inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) 4260 { 4261 if (t1.get<0>() < t2.get<0>()) return true; 4262 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 4263 return false; 4264 } 4265 }; 4266 4267 struct Shift 4268 { 4269 int _shift; 4270 4271 Shift(int shift) : _shift(shift) {} 4272 __host__ __device__ 4273 inline int operator() (const int &c) 4274 { 4275 return c + _shift; 4276 } 4277 }; 4278 4279 /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. 
[A';B']' operation in matlab notation */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     Annz,Bnnz;
  cusparseStatus_t             stat;
  PetscInt                     i,m,n,zero = 0;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",A->rmap->n,B->rmap->n);
  if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
  if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* build C = [A B] from scratch: set up the host Mat, the device CSR storage,
       and record in cooPerm where each entry of A and B lands inside C so that a
       MAT_REUSE_MATRIX call can update values without redoing the symbolic work */
    m    = A->rmap->n;
    n    = A->cmap->n + B->cmap->n;
    ierr = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
    ierr = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
    ierr = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    c     = (Mat_SeqAIJ*)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
    Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr  = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector = NULL;
    Ccusp->nrows      = m;
    Ccusp->mat        = Cmat;
    Ccusp->mat->mat   = Ccsr;
    Ccsr->num_rows    = m;
    Ccsr->num_cols    = n;
    stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
    if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr  = (CsrMatrix*)Acusp->mat->mat;
    Bcsr  = (CsrMatrix*)Bcusp->mat->mat;
    Annz  = (PetscInt)Acsr->column_indices->size();
    Bnnz  = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets    = new THRUSTINTARRAY32(m+1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    Ccsr->num_entries    = c->nz;
    Ccusp->cooPerm       = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      auto Acoo = new THRUSTINTARRAY32(Annz);
      auto Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff,*Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
          ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
          ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      /* expand CSR row offsets into explicit COO row indices so A's and B's
         entries can be merged into row-major order with a single thrust::merge */
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      stat = cusparseXcsr2coo(Acusp->handle,
                              Aroff->data().get(),
                              Annz,
                              m,
                              Acoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle,
                              Broff->data().get(),
                              Bnnz,
                              m,
                              Bcoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
      auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
      auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
      auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
      auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
      auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
      auto p1  = Ccusp->cooPerm->begin();
      auto p2  = Ccusp->cooPerm->begin();
      thrust::advance(p2,Annz);
      /* merge by (row,col); wPerm receives 1 for entries that came from A and 0
         for entries from B, which is then turned into cooPerm: positions of A's
         entries in C first, then positions of B's entries */
      PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
      thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
#endif
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      auto pred = thrust::identity<int>();
      PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
      PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
#endif
      stat = cusparseXcoo2csr(Ccusp->handle,
                              Ccoo->data().get(),
                              c->nz,
                              m,
                              Ccsr->row_offsets->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
                               Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
        ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr);
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix *CcsrT = new CsrMatrix;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated = PETSC_TRUE;
        Ccusp->rowoffsets_gpu = NULL;
        CmatT->cprowIndices = NULL;
        CmatT->mat = CcsrT;
        CcsrT->num_rows = n;
        CcsrT->num_cols = m;
        CcsrT->num_entries = c->nz;

        CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values = new THRUSTARRAY(c->nz);

        /* C' = [A' ; B'] stacks A' on top of B': concatenate the transpose CSR
           data, shifting B' row offsets by a->nz (the entries of A' before them) */
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
          thrust::advance(rT,-1); /* the last offset of A' coincides with the first (shifted) offset of B' */
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
          thrust::copy(titb,tite,rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

        stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
                                 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
                                 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror the symbolic structure (i,j and per-row counts) on the host */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii = *Ccsr->row_offsets;
      jj = *Ccsr->column_indices;
      cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    } else {
      cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
    c->maxnz         = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax          = 0;
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i+1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax,nn);
    }
    ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
    (*C)->nonzerostate++;
    ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
    ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* MAT_REUSE_MATRIX: the structure of C is unchanged, only scatter the
       current values of A and B into C through the stored cooPerm permutation */
    if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",(*C)->rmap->n,B->rmap->n);
    c = (Mat_SeqAIJ*)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
      if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
      if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
      if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
      if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix*)Acusp->mat->mat;
      Bcsr = (CsrMatrix*)Bcusp->mat->mat;
      Ccsr = (CsrMatrix*)Ccusp->mat->mat;
      if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
      if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
      if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
      if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
      if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid,Acsr->num_entries);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      /* cooPerm[0..Annz) are the destinations of A's values, cooPerm[Annz..) of B's */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      thrust::for_each(zibait,zieait,VecCUDAEquals());
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
      thrust::for_each(zibbit,ziebit,VecCUDAEquals());
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr);
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    }
  }
  ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}

/* Gather entries av[idx[k]] (or the leading n entries if idx is NULL) of the
   device matrix values of A into v; v may live in host or device memory */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;
  const PetscScalar *av;
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    THRUSTARRAY *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      /* stage the gathered values in a device buffer, then copy them back to the host */
      w  = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) {
      cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    delete w;
  } else {
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  /* was PetscLogCpuToGpu: the transfer logged here is device -> host */
  if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); }
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}