1 /* 2 Defines the basic matrix operations for the AIJ (compressed row) 3 matrix storage format using the CUSPARSE library, 4 */ 5 #define PETSC_SKIP_SPINLOCK 6 #define PETSC_SKIP_CXX_COMPLEX_FIX 7 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 8 9 #include <petscconf.h> 10 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 11 #include <../src/mat/impls/sbaij/seq/sbaij.h> 12 #include <../src/vec/vec/impls/dvecimpl.h> 13 #include <petsc/private/vecimpl.h> 14 #undef VecType 15 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 16 17 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 18 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 19 /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in 20 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them. 21 22 typedef enum { 23 CUSPARSE_MV_ALG_DEFAULT = 0, 24 CUSPARSE_COOMV_ALG = 1, 25 CUSPARSE_CSRMV_ALG1 = 2, 26 CUSPARSE_CSRMV_ALG2 = 3 27 } cusparseSpMVAlg_t; 28 29 typedef enum { 30 CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0, 31 CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1, 32 CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2, 33 CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3, 34 CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4, 35 CUSPARSE_SPMM_ALG_DEFAULT = 0, 36 CUSPARSE_SPMM_COO_ALG1 = 1, 37 CUSPARSE_SPMM_COO_ALG2 = 2, 38 CUSPARSE_SPMM_COO_ALG3 = 3, 39 CUSPARSE_SPMM_COO_ALG4 = 5, 40 CUSPARSE_SPMM_CSR_ALG1 = 4, 41 CUSPARSE_SPMM_CSR_ALG2 = 6, 42 } cusparseSpMMAlg_t; 43 44 typedef enum { 45 CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc 46 CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministc 47 } cusparseCsr2CscAlg_t; 48 */ 49 const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0}; 50 const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0}; 51 const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! 
We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0}; 52 #endif 53 54 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 55 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 56 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 57 58 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 59 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 60 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 61 62 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 63 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 64 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 65 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 66 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat); 67 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 68 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 69 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 70 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 71 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 72 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 73 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool); 74 75 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**); 76 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**); 77 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat); 78 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**); 79 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**); 80 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**); 81 82 PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]); 83 PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode); 84 85 PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream) 86 { 87 cusparseStatus_t stat; 88 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 89 90 PetscFunctionBegin; 91 if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 92 cusparsestruct->stream = stream; 93 stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat); 94 PetscFunctionReturn(0); 95 } 96 97 PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle) 98 { 99 cusparseStatus_t stat; 100 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 101 102 PetscFunctionBegin; 103 if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 104 if (cusparsestruct->handle != handle) { 105 if (cusparsestruct->handle) { 106 stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat); 107 } 108 cusparsestruct->handle = handle; 109 } 110 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 111 PetscFunctionReturn(0); 112 } 113 114 PetscErrorCode MatCUSPARSEClearHandle(Mat A) 115 { 116 Mat_SeqAIJCUSPARSE 
*cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 117 PetscBool flg; 118 PetscErrorCode ierr; 119 120 PetscFunctionBegin; 121 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 122 if (!flg || !cusparsestruct) PetscFunctionReturn(0); 123 if (cusparsestruct->handle) cusparsestruct->handle = 0; 124 PetscFunctionReturn(0); 125 } 126 127 PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type) 128 { 129 PetscFunctionBegin; 130 *type = MATSOLVERCUSPARSE; 131 PetscFunctionReturn(0); 132 } 133 134 /*MC 135 MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices 136 on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported 137 algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 138 performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 139 CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 140 algorithms are not recommended. This class does NOT support direct solver operations. 141 142 Level: beginner 143 144 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 145 M*/ 146 147 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B) 148 { 149 PetscErrorCode ierr; 150 PetscInt n = A->rmap->n; 151 152 PetscFunctionBegin; 153 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 154 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 155 (*B)->factortype = ftype; 156 (*B)->useordering = PETSC_TRUE; 157 ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 158 159 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 160 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 161 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 162 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 163 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 164 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 165 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 166 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 167 168 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 169 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr); 170 PetscFunctionReturn(0); 171 } 172 173 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 174 { 175 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 176 177 PetscFunctionBegin; 178 switch (op) { 179 case MAT_CUSPARSE_MULT: 180 cusparsestruct->format = format; 181 break; 182 case MAT_CUSPARSE_ALL: 183 cusparsestruct->format = format; 184 break; 185 default: 186 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op); 187 } 188 PetscFunctionReturn(0); 189 } 190 191 /*@ 192 MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 193 operation. 
Only the MatMult operation can use different GPU storage formats 194 for MPIAIJCUSPARSE matrices. 195 Not Collective 196 197 Input Parameters: 198 + A - Matrix of type SEQAIJCUSPARSE 199 . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 200 - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2) 201 202 Output Parameter: 203 204 Level: intermediate 205 206 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 207 @*/ 208 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 209 { 210 PetscErrorCode ierr; 211 212 PetscFunctionBegin; 213 PetscValidHeaderSpecific(A, MAT_CLASSID,1); 214 ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr); 215 PetscFunctionReturn(0); 216 } 217 218 /*@ 219 MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the tranpose matrix before calling MatMultTranspose 220 221 Collective on mat 222 223 Input Parameters: 224 + A - Matrix of type SEQAIJCUSPARSE 225 - transgen - the boolean flag 226 227 Level: intermediate 228 229 .seealso: MATSEQAIJCUSPARSE 230 @*/ 231 PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen) 232 { 233 PetscErrorCode ierr; 234 PetscBool flg; 235 236 PetscFunctionBegin; 237 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 238 ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 239 if (flg) { 240 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 241 242 if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 243 cusp->transgen = transgen; 244 if (!transgen) { /* need to destroy the transpose matrix if present to prevent from logic errors if transgen is set to true later */ 245 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr); 246 } 247 } 248 PetscFunctionReturn(0); 249 } 250 251 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 252 { 253 PetscErrorCode ierr; 254 MatCUSPARSEStorageFormat format; 255 PetscBool flg; 256 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 257 258 PetscFunctionBegin; 259 ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr); 260 if (A->factortype == MAT_FACTOR_NONE) { 261 PetscBool transgen = cusparsestruct->transgen; 262 263 ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr); 264 if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);} 265 266 ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV", 267 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 268 if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);} 269 270 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 271 
"MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 272 if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);} 273 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 274 cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 275 ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", 276 "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr); 277 /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */ 278 if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 279 280 cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 281 ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", 282 "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr); 283 if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly"); 284 285 cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 286 ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", 287 "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr); 288 if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly"); 289 #endif 290 } 291 ierr = PetscOptionsTail();CHKERRQ(ierr); 292 PetscFunctionReturn(0); 293 } 294 295 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 296 { 297 PetscErrorCode ierr; 298 299 PetscFunctionBegin; 300 ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 301 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 302 PetscFunctionReturn(0); 303 } 304 305 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 306 { 307 PetscErrorCode ierr; 308 309 PetscFunctionBegin; 310 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 311 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 312 PetscFunctionReturn(0); 313 } 314 315 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 316 { 317 PetscErrorCode ierr; 318 319 PetscFunctionBegin; 320 ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 321 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 322 PetscFunctionReturn(0); 323 } 324 325 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 326 { 327 PetscErrorCode ierr; 328 329 PetscFunctionBegin; 330 ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 331 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 332 
PetscFunctionReturn(0); 333 } 334 335 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 336 { 337 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 338 PetscInt n = A->rmap->n; 339 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 340 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 341 cusparseStatus_t stat; 342 const PetscInt *ai = a->i,*aj = a->j,*vi; 343 const MatScalar *aa = a->a,*v; 344 PetscInt *AiLo, *AjLo; 345 PetscScalar *AALo; 346 PetscInt i,nz, nzLower, offset, rowOffset; 347 PetscErrorCode ierr; 348 cudaError_t cerr; 349 350 PetscFunctionBegin; 351 if (!n) PetscFunctionReturn(0); 352 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 353 try { 354 /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */ 355 nzLower=n+ai[n]-ai[1]; 356 357 /* Allocate Space for the lower triangular matrix */ 358 cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 359 cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr); 360 cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr); 361 362 /* Fill the lower triangular matrix */ 363 AiLo[0] = (PetscInt) 0; 364 AiLo[n] = nzLower; 365 AjLo[0] = (PetscInt) 0; 366 AALo[0] = (MatScalar) 1.0; 367 v = aa; 368 vi = aj; 369 offset = 1; 370 rowOffset= 1; 371 for (i=1; i<n; i++) { 372 nz = ai[i+1] - ai[i]; 373 /* additional 1 for the term on the diagonal */ 374 AiLo[i] = rowOffset; 375 rowOffset += nz+1; 376 377 ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr); 378 ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr); 379 380 offset += nz; 381 AjLo[offset] = (PetscInt) i; 382 AALo[offset] = (MatScalar) 1.0; 383 offset += 1; 384 385 v += nz; 386 vi += nz; 387 } 388 389 /* allocate space for the triangular factor information */ 390 loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 391 392 /* Create the matrix description */ 393 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 394 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 395 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 396 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 397 #else 398 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 399 #endif 400 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat); 401 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 402 403 /* set the operation */ 404 loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 405 406 /* set the matrix */ 407 loTriFactor->csrMat = new CsrMatrix; 408 loTriFactor->csrMat->num_rows = n; 409 loTriFactor->csrMat->num_cols = n; 410 loTriFactor->csrMat->num_entries = nzLower; 411 412 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 413 loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1); 414 415 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 416 loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower); 417 418 loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 419 loTriFactor->csrMat->values->assign(AALo, AALo+nzLower); 420 421 /* Create the solve analysis information */ 422 stat = 
cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 423 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 424 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 425 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 426 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 427 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 428 &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 429 cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 430 #endif 431 432 /* perform the solve analysis */ 433 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 434 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 435 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 436 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo 437 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 438 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 439 #endif 440 );CHKERRCUSPARSE(stat); 441 442 /* assign the pointer. Is this really necessary? */ 443 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 444 445 cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr); 446 cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr); 447 cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr); 448 ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 449 } catch(char *ex) { 450 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 451 } 452 } 453 PetscFunctionReturn(0); 454 } 455 456 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 457 { 458 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 459 PetscInt n = A->rmap->n; 460 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 461 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 462 cusparseStatus_t stat; 463 const PetscInt *aj = a->j,*adiag = a->diag,*vi; 464 const MatScalar *aa = a->a,*v; 465 PetscInt *AiUp, *AjUp; 466 PetscScalar *AAUp; 467 PetscInt i,nz, nzUpper, offset; 468 PetscErrorCode ierr; 469 cudaError_t cerr; 470 471 PetscFunctionBegin; 472 if (!n) PetscFunctionReturn(0); 473 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 474 try { 475 /* next, figure out the number of nonzeros in the upper triangular matrix. 
*/ 476 nzUpper = adiag[0]-adiag[n]; 477 478 /* Allocate Space for the upper triangular matrix */ 479 cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 480 cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 481 cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 482 483 /* Fill the upper triangular matrix */ 484 AiUp[0]=(PetscInt) 0; 485 AiUp[n]=nzUpper; 486 offset = nzUpper; 487 for (i=n-1; i>=0; i--) { 488 v = aa + adiag[i+1] + 1; 489 vi = aj + adiag[i+1] + 1; 490 491 /* number of elements NOT on the diagonal */ 492 nz = adiag[i] - adiag[i+1]-1; 493 494 /* decrement the offset */ 495 offset -= (nz+1); 496 497 /* first, set the diagonal elements */ 498 AjUp[offset] = (PetscInt) i; 499 AAUp[offset] = (MatScalar)1./v[nz]; 500 AiUp[i] = AiUp[i+1] - (nz+1); 501 502 ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr); 503 ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr); 504 } 505 506 /* allocate space for the triangular factor information */ 507 upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 508 509 /* Create the matrix description */ 510 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 511 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 512 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 513 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 514 #else 515 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 516 #endif 517 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 518 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 519 520 /* set the operation */ 521 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 522 523 /* set the matrix */ 524 upTriFactor->csrMat = new CsrMatrix; 525 upTriFactor->csrMat->num_rows = n; 526 upTriFactor->csrMat->num_cols = n; 527 upTriFactor->csrMat->num_entries = nzUpper; 528 529 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 530 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 531 532 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 533 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 534 535 upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 536 upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 537 538 /* Create the solve analysis information */ 539 stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 540 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 541 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 542 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 543 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 544 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 545 &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 546 cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 547 #endif 548 549 /* perform the solve analysis */ 550 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 551 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 552 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 553 
upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo 554 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 555 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 556 #endif 557 );CHKERRCUSPARSE(stat); 558 559 /* assign the pointer. Is this really necessary? */ 560 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 561 562 cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 563 cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 564 cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr); 565 ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 566 } catch(char *ex) { 567 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 568 } 569 } 570 PetscFunctionReturn(0); 571 } 572 573 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 574 { 575 PetscErrorCode ierr; 576 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 577 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 578 IS isrow = a->row,iscol = a->icol; 579 PetscBool row_identity,col_identity; 580 const PetscInt *r,*c; 581 PetscInt n = A->rmap->n; 582 583 PetscFunctionBegin; 584 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 585 ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr); 586 ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr); 587 588 cusparseTriFactors->workVector = new THRUSTARRAY(n); 589 cusparseTriFactors->nnz=a->nz; 590 591 A->offloadmask = PETSC_OFFLOAD_BOTH; 592 /* lower triangular indices */ 593 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 594 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 595 if (!row_identity) { 596 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 597 cusparseTriFactors->rpermIndices->assign(r, r+n); 598 } 599 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 600 601 /* upper triangular indices */ 602 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 603 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 604 if (!col_identity) { 605 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 606 cusparseTriFactors->cpermIndices->assign(c, c+n); 607 } 608 609 if (!row_identity && !col_identity) { 610 ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr); 611 } else if(!row_identity) { 612 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 613 } else if(!col_identity) { 614 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 615 } 616 617 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 618 PetscFunctionReturn(0); 619 } 620 621 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 622 { 623 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 624 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 625 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 626 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 627 cusparseStatus_t stat; 628 PetscErrorCode ierr; 629 cudaError_t cerr; 630 PetscInt *AiUp, *AjUp; 631 PetscScalar *AAUp; 632 PetscScalar *AALo; 633 PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 634 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 635 const PetscInt *ai = b->i,*aj = b->j,*vj; 636 const MatScalar *aa = b->a,*v; 637 638 PetscFunctionBegin; 639 if (!n) PetscFunctionReturn(0); 640 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 641 try { 642 /* Allocate Space for the upper triangular 
matrix */ 643 cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 644 cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 645 cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 646 cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 647 648 /* Fill the upper triangular matrix */ 649 AiUp[0]=(PetscInt) 0; 650 AiUp[n]=nzUpper; 651 offset = 0; 652 for (i=0; i<n; i++) { 653 /* set the pointers */ 654 v = aa + ai[i]; 655 vj = aj + ai[i]; 656 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 657 658 /* first, set the diagonal elements */ 659 AjUp[offset] = (PetscInt) i; 660 AAUp[offset] = (MatScalar)1.0/v[nz]; 661 AiUp[i] = offset; 662 AALo[offset] = (MatScalar)1.0/v[nz]; 663 664 offset+=1; 665 if (nz>0) { 666 ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr); 667 ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 668 for (j=offset; j<offset+nz; j++) { 669 AAUp[j] = -AAUp[j]; 670 AALo[j] = AAUp[j]/v[nz]; 671 } 672 offset+=nz; 673 } 674 } 675 676 /* allocate space for the triangular factor information */ 677 upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 678 679 /* Create the matrix description */ 680 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 681 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 682 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 683 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 684 #else 685 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 686 #endif 687 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 688 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 689 690 /* set the matrix */ 691 upTriFactor->csrMat = new CsrMatrix; 692 upTriFactor->csrMat->num_rows = A->rmap->n; 693 upTriFactor->csrMat->num_cols = A->cmap->n; 694 upTriFactor->csrMat->num_entries = a->nz; 695 696 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 697 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 698 699 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 700 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 701 702 upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 703 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 704 705 /* set the operation */ 706 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 707 708 /* Create the solve analysis information */ 709 stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 710 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 711 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 712 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 713 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 714 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 715 &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 716 cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 717 #endif 718 719 /* perform the solve analysis */ 720 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 721 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 722 
upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 723 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo 724 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 725 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 726 #endif 727 );CHKERRCUSPARSE(stat); 728 729 /* assign the pointer. Is this really necessary? */ 730 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 731 732 /* allocate space for the triangular factor information */ 733 loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 734 735 /* Create the matrix description */ 736 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 737 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 738 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 739 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 740 #else 741 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 742 #endif 743 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 744 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 745 746 /* set the operation */ 747 loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 748 749 /* set the matrix */ 750 loTriFactor->csrMat = new CsrMatrix; 751 loTriFactor->csrMat->num_rows = A->rmap->n; 752 loTriFactor->csrMat->num_cols = A->cmap->n; 753 loTriFactor->csrMat->num_entries = a->nz; 754 755 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 756 loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 757 758 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 759 loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 760 761 loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 762 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 763 ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr); 764 765 /* Create the solve analysis information */ 766 stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 767 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 768 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 769 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 770 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 771 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 772 &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 773 cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 774 #endif 775 776 /* perform the solve analysis */ 777 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 778 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 779 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 780 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo 781 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 782 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 783 #endif 784 );CHKERRCUSPARSE(stat); 785 786 /* assign the pointer. Is this really necessary? 
*/ 787 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 788 789 A->offloadmask = PETSC_OFFLOAD_BOTH; 790 cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 791 cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 792 cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr); 793 cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr); 794 } catch(char *ex) { 795 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 796 } 797 } 798 PetscFunctionReturn(0); 799 } 800 801 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 802 { 803 PetscErrorCode ierr; 804 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 805 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 806 IS ip = a->row; 807 const PetscInt *rip; 808 PetscBool perm_identity; 809 PetscInt n = A->rmap->n; 810 811 PetscFunctionBegin; 812 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 813 ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr); 814 cusparseTriFactors->workVector = new THRUSTARRAY(n); 815 cusparseTriFactors->nnz=(a->nz-n)*2 + n; 816 817 /* lower triangular indices */ 818 ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); 819 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 820 if (!perm_identity) { 821 IS iip; 822 const PetscInt *irip; 823 824 ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr); 825 ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr); 826 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 827 cusparseTriFactors->rpermIndices->assign(rip, rip+n); 828 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 829 cusparseTriFactors->cpermIndices->assign(irip, irip+n); 830 ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr); 831 ierr = ISDestroy(&iip);CHKERRQ(ierr); 832 ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr); 833 } 834 ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 835 PetscFunctionReturn(0); 836 } 837 838 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 839 { 840 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 841 IS isrow = b->row,iscol = b->col; 842 PetscBool row_identity,col_identity; 843 PetscErrorCode ierr; 844 845 PetscFunctionBegin; 846 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 847 B->offloadmask = PETSC_OFFLOAD_CPU; 848 /* determine which version of MatSolve needs to be used. */ 849 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 850 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 851 if (row_identity && col_identity) { 852 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 853 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 854 B->ops->matsolve = NULL; 855 B->ops->matsolvetranspose = NULL; 856 } else { 857 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 858 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 859 B->ops->matsolve = NULL; 860 B->ops->matsolvetranspose = NULL; 861 } 862 863 /* get the triangular factors */ 864 ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 865 PetscFunctionReturn(0); 866 } 867 868 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 869 { 870 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 871 IS ip = b->row; 872 PetscBool perm_identity; 873 PetscErrorCode ierr; 874 875 PetscFunctionBegin; 876 ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 877 B->offloadmask = PETSC_OFFLOAD_CPU; 878 /* determine which version of MatSolve needs to be used. 
*/ 879 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 880 if (perm_identity) { 881 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 882 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 883 B->ops->matsolve = NULL; 884 B->ops->matsolvetranspose = NULL; 885 } else { 886 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 887 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 888 B->ops->matsolve = NULL; 889 B->ops->matsolvetranspose = NULL; 890 } 891 892 /* get the triangular factors */ 893 ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 894 PetscFunctionReturn(0); 895 } 896 897 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 898 { 899 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 900 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 901 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 902 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 903 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 904 cusparseStatus_t stat; 905 cusparseIndexBase_t indexBase; 906 cusparseMatrixType_t matrixType; 907 cusparseFillMode_t fillMode; 908 cusparseDiagType_t diagType; 909 910 PetscFunctionBegin; 911 912 /*********************************************/ 913 /* Now the Transpose of the Lower Tri Factor */ 914 /*********************************************/ 915 916 /* allocate space for the transpose of the lower triangular factor */ 917 loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 918 919 /* set the matrix descriptors of the lower triangular factor */ 920 matrixType = cusparseGetMatType(loTriFactor->descr); 921 indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 922 fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 923 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 924 diagType = cusparseGetMatDiagType(loTriFactor->descr); 925 926 /* Create the matrix description */ 927 stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat); 928 stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 929 stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 930 stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 931 stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 932 933 /* set the operation */ 934 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 935 936 /* allocate GPU space for the CSC of the lower triangular factor*/ 937 loTriFactorT->csrMat = new CsrMatrix; 938 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 939 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 940 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 941 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1); 942 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 943 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 944 945 /* compute the transpose of the lower triangular factor, i.e. 
the CSC */ 946 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 947 stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 948 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 949 loTriFactor->csrMat->values->data().get(), 950 loTriFactor->csrMat->row_offsets->data().get(), 951 loTriFactor->csrMat->column_indices->data().get(), 952 loTriFactorT->csrMat->values->data().get(), 953 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 954 CUSPARSE_ACTION_NUMERIC,indexBase, 955 CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 956 cudaError_t cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 957 #endif 958 959 stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 960 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 961 loTriFactor->csrMat->values->data().get(), 962 loTriFactor->csrMat->row_offsets->data().get(), 963 loTriFactor->csrMat->column_indices->data().get(), 964 loTriFactorT->csrMat->values->data().get(), 965 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 966 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 967 CUSPARSE_ACTION_NUMERIC, indexBase, 968 CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer 969 #else 970 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 971 CUSPARSE_ACTION_NUMERIC, indexBase 972 #endif 973 );CHKERRCUSPARSE(stat); 974 975 /* Create the solve analysis information */ 976 stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 977 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 978 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, 979 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 980 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 981 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, 982 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 983 cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 984 #endif 985 986 /* perform the solve analysis */ 987 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 988 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 989 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 990 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo 991 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 992 ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 993 #endif 994 );CHKERRCUSPARSE(stat); 995 996 /* assign the pointer. Is this really necessary? 
*/ 997 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 998 999 /*********************************************/ 1000 /* Now the Transpose of the Upper Tri Factor */ 1001 /*********************************************/ 1002 1003 /* allocate space for the transpose of the upper triangular factor */ 1004 upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 1005 1006 /* set the matrix descriptors of the upper triangular factor */ 1007 matrixType = cusparseGetMatType(upTriFactor->descr); 1008 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 1009 fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1010 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1011 diagType = cusparseGetMatDiagType(upTriFactor->descr); 1012 1013 /* Create the matrix description */ 1014 stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat); 1015 stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 1016 stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 1017 stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 1018 stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1019 1020 /* set the operation */ 1021 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1022 1023 /* allocate GPU space for the CSC of the upper triangular factor*/ 1024 upTriFactorT->csrMat = new CsrMatrix; 1025 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1026 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1027 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1028 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1); 1029 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1030 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1031 1032 /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 1033 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1034 stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows, 1035 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1036 upTriFactor->csrMat->values->data().get(), 1037 upTriFactor->csrMat->row_offsets->data().get(), 1038 upTriFactor->csrMat->column_indices->data().get(), 1039 upTriFactorT->csrMat->values->data().get(), 1040 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1041 CUSPARSE_ACTION_NUMERIC,indexBase, 1042 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1043 cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1044 #endif 1045 1046 stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 1047 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1048 upTriFactor->csrMat->values->data().get(), 1049 upTriFactor->csrMat->row_offsets->data().get(), 1050 upTriFactor->csrMat->column_indices->data().get(), 1051 upTriFactorT->csrMat->values->data().get(), 1052 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1053 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1054 CUSPARSE_ACTION_NUMERIC, indexBase, 1055 CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer 1056 #else 1057 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1058 CUSPARSE_ACTION_NUMERIC, indexBase 1059 #endif 1060 );CHKERRCUSPARSE(stat); 1061 1062 /* Create the solve analysis information */ 1063 stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1064 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1065 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, 1066 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1067 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1068 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, 1069 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1070 cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1071 #endif 1072 1073 /* perform the solve analysis */ 1074 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 1075 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1076 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1077 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo 1078 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1079 ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1080 #endif 1081 );CHKERRCUSPARSE(stat); 1082 1083 /* assign the pointer. Is this really necessary? 
*/ 1084 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 1085 PetscFunctionReturn(0); 1086 } 1087 1088 static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A) 1089 { 1090 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1091 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1092 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1093 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1094 cusparseStatus_t stat; 1095 cusparseIndexBase_t indexBase; 1096 cudaError_t err; 1097 PetscErrorCode ierr; 1098 1099 PetscFunctionBegin; 1100 if (!cusparsestruct->transgen || cusparsestruct->matTranspose) PetscFunctionReturn(0); 1101 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1102 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1103 /* create cusparse matrix */ 1104 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 1105 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat); 1106 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1107 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat); 1108 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1109 1110 /* set alpha and beta */ 1111 err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 1112 err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1113 err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1114 err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1115 err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1116 err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1117 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 1118 1119 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1120 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1121 CsrMatrix *matrixT= new CsrMatrix; 1122 matrixT->num_rows = A->cmap->n; 1123 matrixT->num_cols = A->rmap->n; 1124 matrixT->num_entries = a->nz; 1125 matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 1126 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1127 matrixT->values = new THRUSTARRAY(a->nz); 1128 1129 cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); 1130 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 1131 1132 /* compute the transpose, i.e. 
the CSC */ 1133 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1134 stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, 1135 A->cmap->n, matrix->num_entries, 1136 matrix->values->data().get(), 1137 cusparsestruct->rowoffsets_gpu->data().get(), 1138 matrix->column_indices->data().get(), 1139 matrixT->values->data().get(), 1140 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1141 CUSPARSE_ACTION_NUMERIC,indexBase, 1142 cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1143 err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err); 1144 #endif 1145 1146 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1147 A->cmap->n, matrix->num_entries, 1148 matrix->values->data().get(), 1149 cusparsestruct->rowoffsets_gpu->data().get(), 1150 matrix->column_indices->data().get(), 1151 matrixT->values->data().get(), 1152 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1153 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1154 CUSPARSE_ACTION_NUMERIC,indexBase, 1155 cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer 1156 #else 1157 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1158 CUSPARSE_ACTION_NUMERIC, indexBase 1159 #endif 1160 );CHKERRCUSPARSE(stat); 1161 matstructT->mat = matrixT; 1162 1163 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1164 stat = cusparseCreateCsr(&matstructT->matDescr, 1165 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1166 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1167 matrixT->values->data().get(), 1168 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1169 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 1170 #endif 1171 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1172 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1173 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1174 #else 1175 CsrMatrix *temp = new CsrMatrix; 1176 CsrMatrix *tempT = new CsrMatrix; 1177 /* First convert HYB to CSR */ 1178 temp->num_rows = A->rmap->n; 1179 temp->num_cols = A->cmap->n; 1180 temp->num_entries = a->nz; 1181 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1182 temp->column_indices = new THRUSTINTARRAY32(a->nz); 1183 temp->values = new THRUSTARRAY(a->nz); 1184 1185 stat = cusparse_hyb2csr(cusparsestruct->handle, 1186 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 1187 temp->values->data().get(), 1188 temp->row_offsets->data().get(), 1189 temp->column_indices->data().get());CHKERRCUSPARSE(stat); 1190 1191 /* Next, convert CSR to CSC (i.e. 
the matrix transpose) */ 1192 tempT->num_rows = A->rmap->n; 1193 tempT->num_cols = A->cmap->n; 1194 tempT->num_entries = a->nz; 1195 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1196 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1197 tempT->values = new THRUSTARRAY(a->nz); 1198 1199 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1200 temp->num_cols, temp->num_entries, 1201 temp->values->data().get(), 1202 temp->row_offsets->data().get(), 1203 temp->column_indices->data().get(), 1204 tempT->values->data().get(), 1205 tempT->column_indices->data().get(), 1206 tempT->row_offsets->data().get(), 1207 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1208 1209 /* Last, convert CSC to HYB */ 1210 cusparseHybMat_t hybMat; 1211 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1212 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1213 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1214 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1215 matstructT->descr, tempT->values->data().get(), 1216 tempT->row_offsets->data().get(), 1217 tempT->column_indices->data().get(), 1218 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1219 1220 /* assign the pointer */ 1221 matstructT->mat = hybMat; 1222 /* delete temporaries */ 1223 if (tempT) { 1224 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1225 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1226 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1227 delete (CsrMatrix*) tempT; 1228 } 1229 if (temp) { 1230 if (temp->values) delete (THRUSTARRAY*) temp->values; 1231 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1232 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1233 delete (CsrMatrix*) temp; 1234 } 1235 #endif 1236 } 1237 err = WaitForCUDA();CHKERRCUDA(err); 1238 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1239 ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1240 /* the compressed row indices is not used for matTranspose */ 1241 matstructT->cprowIndices = NULL; 1242 /* assign the pointer */ 1243 ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 1244 PetscFunctionReturn(0); 1245 } 1246 1247 /* Why do we need to analyze the tranposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 1248 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1249 { 1250 PetscInt n = xx->map->n; 1251 const PetscScalar *barray; 1252 PetscScalar *xarray; 1253 thrust::device_ptr<const PetscScalar> bGPU; 1254 thrust::device_ptr<PetscScalar> xGPU; 1255 cusparseStatus_t stat; 1256 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1257 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1258 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1259 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1260 PetscErrorCode ierr; 1261 cudaError_t cerr; 1262 1263 PetscFunctionBegin; 1264 /* Analyze the matrix and create the transpose ... 
on the fly */ 1265 if (!loTriFactorT && !upTriFactorT) { 1266 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1267 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1268 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1269 } 1270 1271 /* Get the GPU pointers */ 1272 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1273 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1274 xGPU = thrust::device_pointer_cast(xarray); 1275 bGPU = thrust::device_pointer_cast(barray); 1276 1277 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1278 /* First, reorder with the row permutation */ 1279 thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1280 thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 1281 xGPU); 1282 1283 /* First, solve U */ 1284 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1285 upTriFactorT->csrMat->num_rows, 1286 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1287 upTriFactorT->csrMat->num_entries, 1288 #endif 1289 &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1290 upTriFactorT->csrMat->values->data().get(), 1291 upTriFactorT->csrMat->row_offsets->data().get(), 1292 upTriFactorT->csrMat->column_indices->data().get(), 1293 upTriFactorT->solveInfo, 1294 xarray, tempGPU->data().get() 1295 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1296 ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1297 #endif 1298 );CHKERRCUSPARSE(stat); 1299 1300 /* Then, solve L */ 1301 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1302 loTriFactorT->csrMat->num_rows, 1303 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1304 loTriFactorT->csrMat->num_entries, 1305 #endif 1306 &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1307 loTriFactorT->csrMat->values->data().get(), 1308 loTriFactorT->csrMat->row_offsets->data().get(), 1309 loTriFactorT->csrMat->column_indices->data().get(), 1310 loTriFactorT->solveInfo, 1311 tempGPU->data().get(), xarray 1312 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1313 ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1314 #endif 1315 );CHKERRCUSPARSE(stat); 1316 1317 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1318 thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1319 thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1320 tempGPU->begin()); 1321 1322 /* Copy the temporary to the full solution. 
*/ 1323 thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU); 1324 1325 /* restore */ 1326 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1327 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1328 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1329 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1330 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1331 PetscFunctionReturn(0); 1332 } 1333 1334 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1335 { 1336 const PetscScalar *barray; 1337 PetscScalar *xarray; 1338 cusparseStatus_t stat; 1339 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1340 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1341 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1342 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1343 PetscErrorCode ierr; 1344 cudaError_t cerr; 1345 1346 PetscFunctionBegin; 1347 /* Analyze the matrix and create the transpose ... on the fly */ 1348 if (!loTriFactorT && !upTriFactorT) { 1349 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1350 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1351 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1352 } 1353 1354 /* Get the GPU pointers */ 1355 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1356 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1357 1358 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1359 /* First, solve U */ 1360 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1361 upTriFactorT->csrMat->num_rows, 1362 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1363 upTriFactorT->csrMat->num_entries, 1364 #endif 1365 &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1366 upTriFactorT->csrMat->values->data().get(), 1367 upTriFactorT->csrMat->row_offsets->data().get(), 1368 upTriFactorT->csrMat->column_indices->data().get(), 1369 upTriFactorT->solveInfo, 1370 barray, tempGPU->data().get() 1371 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1372 ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1373 #endif 1374 );CHKERRCUSPARSE(stat); 1375 1376 /* Then, solve L */ 1377 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1378 loTriFactorT->csrMat->num_rows, 1379 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1380 loTriFactorT->csrMat->num_entries, 1381 #endif 1382 &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1383 loTriFactorT->csrMat->values->data().get(), 1384 loTriFactorT->csrMat->row_offsets->data().get(), 1385 loTriFactorT->csrMat->column_indices->data().get(), 1386 loTriFactorT->solveInfo, 1387 tempGPU->data().get(), xarray 1388 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1389 ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1390 #endif 1391 );CHKERRCUSPARSE(stat); 1392 1393 /* restore */ 1394 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1395 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1396 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1397 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1398 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1399 PetscFunctionReturn(0); 1400 } 1401 1402 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1403 { 1404 const PetscScalar *barray; 1405 PetscScalar 
*xarray; 1406 thrust::device_ptr<const PetscScalar> bGPU; 1407 thrust::device_ptr<PetscScalar> xGPU; 1408 cusparseStatus_t stat; 1409 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1410 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1411 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1412 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1413 PetscErrorCode ierr; 1414 cudaError_t cerr; 1415 1416 PetscFunctionBegin; 1417 1418 /* Get the GPU pointers */ 1419 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1420 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1421 xGPU = thrust::device_pointer_cast(xarray); 1422 bGPU = thrust::device_pointer_cast(barray); 1423 1424 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1425 /* First, reorder with the row permutation */ 1426 thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1427 thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 1428 tempGPU->begin()); 1429 1430 /* Next, solve L */ 1431 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1432 loTriFactor->csrMat->num_rows, 1433 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1434 loTriFactor->csrMat->num_entries, 1435 #endif 1436 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1437 loTriFactor->csrMat->values->data().get(), 1438 loTriFactor->csrMat->row_offsets->data().get(), 1439 loTriFactor->csrMat->column_indices->data().get(), 1440 loTriFactor->solveInfo, 1441 tempGPU->data().get(), xarray 1442 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1443 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 1444 #endif 1445 );CHKERRCUSPARSE(stat); 1446 1447 /* Then, solve U */ 1448 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1449 upTriFactor->csrMat->num_rows, 1450 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1451 upTriFactor->csrMat->num_entries, 1452 #endif 1453 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1454 upTriFactor->csrMat->values->data().get(), 1455 upTriFactor->csrMat->row_offsets->data().get(), 1456 upTriFactor->csrMat->column_indices->data().get(), 1457 upTriFactor->solveInfo, 1458 xarray, tempGPU->data().get() 1459 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1460 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 1461 #endif 1462 );CHKERRCUSPARSE(stat); 1463 1464 /* Last, reorder with the column permutation */ 1465 thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 1466 thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), 1467 xGPU); 1468 1469 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1470 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1471 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1472 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1473 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1474 PetscFunctionReturn(0); 1475 } 1476 1477 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1478 { 1479 const PetscScalar *barray; 1480 PetscScalar *xarray; 1481 cusparseStatus_t stat; 1482 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1483 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1484 
Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1485 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1486 PetscErrorCode ierr; 1487 cudaError_t cerr; 1488 1489 PetscFunctionBegin; 1490 /* Get the GPU pointers */ 1491 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1492 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1493 1494 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1495 /* First, solve L */ 1496 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1497 loTriFactor->csrMat->num_rows, 1498 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1499 loTriFactor->csrMat->num_entries, 1500 #endif 1501 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1502 loTriFactor->csrMat->values->data().get(), 1503 loTriFactor->csrMat->row_offsets->data().get(), 1504 loTriFactor->csrMat->column_indices->data().get(), 1505 loTriFactor->solveInfo, 1506 barray, tempGPU->data().get() 1507 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1508 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 1509 #endif 1510 );CHKERRCUSPARSE(stat); 1511 1512 /* Next, solve U */ 1513 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1514 upTriFactor->csrMat->num_rows, 1515 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1516 upTriFactor->csrMat->num_entries, 1517 #endif 1518 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1519 upTriFactor->csrMat->values->data().get(), 1520 upTriFactor->csrMat->row_offsets->data().get(), 1521 upTriFactor->csrMat->column_indices->data().get(), 1522 upTriFactor->solveInfo, 1523 tempGPU->data().get(), xarray 1524 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1525 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 1526 #endif 1527 );CHKERRCUSPARSE(stat); 1528 1529 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1530 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1531 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1532 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1533 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1534 PetscFunctionReturn(0); 1535 } 1536 1537 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) 1538 { 1539 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1540 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1541 cudaError_t cerr; 1542 PetscErrorCode ierr; 1543 1544 PetscFunctionBegin; 1545 if (A->offloadmask == PETSC_OFFLOAD_GPU) { 1546 CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat; 1547 1548 ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 1549 cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 1550 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1551 ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr); 1552 ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 1553 A->offloadmask = PETSC_OFFLOAD_BOTH; 1554 } 1555 PetscFunctionReturn(0); 1556 } 1557 1558 static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 1559 { 1560 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1561 PetscErrorCode ierr; 1562 1563 PetscFunctionBegin; 1564 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 1565 *array = a->a; 1566 A->offloadmask = PETSC_OFFLOAD_CPU; 1567 PetscFunctionReturn(0); 1568 } 1569 1570 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 1571 { 1572 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1573 Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 1574 Mat_SeqAIJ *a = 
(Mat_SeqAIJ*)A->data; 1575 PetscInt m = A->rmap->n,*ii,*ridx,tmp; 1576 PetscErrorCode ierr; 1577 cusparseStatus_t stat; 1578 cudaError_t err; 1579 1580 PetscFunctionBegin; 1581 if (A->boundtocpu) PetscFunctionReturn(0); 1582 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 1583 if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { 1584 /* Copy values only */ 1585 CsrMatrix *matrix,*matrixT; 1586 matrix = (CsrMatrix*)cusparsestruct->mat->mat; 1587 1588 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1589 matrix->values->assign(a->a, a->a+a->nz); 1590 err = WaitForCUDA();CHKERRCUDA(err); 1591 ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 1592 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1593 1594 /* Update matT when it was built before */ 1595 if (cusparsestruct->matTranspose) { 1596 cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr); 1597 matrixT = (CsrMatrix*)cusparsestruct->matTranspose->mat; 1598 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1599 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1600 A->cmap->n, matrix->num_entries, 1601 matrix->values->data().get(), 1602 cusparsestruct->rowoffsets_gpu->data().get(), 1603 matrix->column_indices->data().get(), 1604 matrixT->values->data().get(), 1605 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1606 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1607 CUSPARSE_ACTION_NUMERIC,indexBase, 1608 cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer 1609 #else 1610 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1611 CUSPARSE_ACTION_NUMERIC, indexBase 1612 #endif 1613 );CHKERRCUSPARSE(stat); 1614 err = WaitForCUDA();CHKERRCUDA(err); 1615 ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1616 } 1617 } else { 1618 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1619 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr); 1620 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr); 1621 delete cusparsestruct->workVector; 1622 delete cusparsestruct->rowoffsets_gpu; 1623 try { 1624 if (a->compressedrow.use) { 1625 m = a->compressedrow.nrows; 1626 ii = a->compressedrow.i; 1627 ridx = a->compressedrow.rindex; 1628 } else { 1629 m = A->rmap->n; 1630 ii = a->i; 1631 ridx = NULL; 1632 } 1633 cusparsestruct->nrows = m; 1634 1635 /* create cusparse matrix */ 1636 matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 1637 stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat); 1638 stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 1639 stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1640 1641 err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 1642 err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1643 err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1644 err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1645 err = 
cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1646 err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1647 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 1648 1649 /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1650 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1651 /* set the matrix */ 1652 CsrMatrix *mat= new CsrMatrix; 1653 mat->num_rows = m; 1654 mat->num_cols = A->cmap->n; 1655 mat->num_entries = a->nz; 1656 mat->row_offsets = new THRUSTINTARRAY32(m+1); 1657 mat->row_offsets->assign(ii, ii + m+1); 1658 1659 mat->column_indices = new THRUSTINTARRAY32(a->nz); 1660 mat->column_indices->assign(a->j, a->j+a->nz); 1661 1662 mat->values = new THRUSTARRAY(a->nz); 1663 mat->values->assign(a->a, a->a+a->nz); 1664 1665 /* assign the pointer */ 1666 matstruct->mat = mat; 1667 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1668 if (mat->num_rows) { /* cusparse errors on empty matrices! */ 1669 stat = cusparseCreateCsr(&matstruct->matDescr, 1670 mat->num_rows, mat->num_cols, mat->num_entries, 1671 mat->row_offsets->data().get(), mat->column_indices->data().get(), 1672 mat->values->data().get(), 1673 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 1674 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 1675 } 1676 #endif 1677 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1678 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1679 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1680 #else 1681 CsrMatrix *mat= new CsrMatrix; 1682 mat->num_rows = m; 1683 mat->num_cols = A->cmap->n; 1684 mat->num_entries = a->nz; 1685 mat->row_offsets = new THRUSTINTARRAY32(m+1); 1686 mat->row_offsets->assign(ii, ii + m+1); 1687 1688 mat->column_indices = new THRUSTINTARRAY32(a->nz); 1689 mat->column_indices->assign(a->j, a->j+a->nz); 1690 1691 mat->values = new THRUSTARRAY(a->nz); 1692 mat->values->assign(a->a, a->a+a->nz); 1693 1694 cusparseHybMat_t hybMat; 1695 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1696 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 
1697 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1698 stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, 1699 matstruct->descr, mat->values->data().get(), 1700 mat->row_offsets->data().get(), 1701 mat->column_indices->data().get(), 1702 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1703 /* assign the pointer */ 1704 matstruct->mat = hybMat; 1705 1706 if (mat) { 1707 if (mat->values) delete (THRUSTARRAY*)mat->values; 1708 if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices; 1709 if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets; 1710 delete (CsrMatrix*)mat; 1711 } 1712 #endif 1713 } 1714 1715 /* assign the compressed row indices */ 1716 if (a->compressedrow.use) { 1717 cusparsestruct->workVector = new THRUSTARRAY(m); 1718 matstruct->cprowIndices = new THRUSTINTARRAY(m); 1719 matstruct->cprowIndices->assign(ridx,ridx+m); 1720 tmp = m; 1721 } else { 1722 cusparsestruct->workVector = NULL; 1723 matstruct->cprowIndices = NULL; 1724 tmp = 0; 1725 } 1726 ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr); 1727 1728 /* assign the pointer */ 1729 cusparsestruct->mat = matstruct; 1730 } catch(char *ex) { 1731 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1732 } 1733 err = WaitForCUDA();CHKERRCUDA(err); 1734 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1735 cusparsestruct->nonzerostate = A->nonzerostate; 1736 } 1737 A->offloadmask = PETSC_OFFLOAD_BOTH; 1738 } 1739 PetscFunctionReturn(0); 1740 } 1741 1742 struct VecCUDAPlusEquals 1743 { 1744 template <typename Tuple> 1745 __host__ __device__ 1746 void operator()(Tuple t) 1747 { 1748 thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 1749 } 1750 }; 1751 1752 struct VecCUDAEquals 1753 { 1754 template <typename Tuple> 1755 __host__ __device__ 1756 void operator()(Tuple t) 1757 { 1758 thrust::get<1>(t) = thrust::get<0>(t); 1759 } 1760 }; 1761 1762 struct VecCUDAEqualsReverse 1763 { 1764 template <typename Tuple> 1765 __host__ __device__ 1766 void operator()(Tuple t) 1767 { 1768 thrust::get<0>(t) = thrust::get<1>(t); 1769 } 1770 }; 1771 1772 struct MatMatCusparse { 1773 PetscBool cisdense; 1774 PetscScalar *Bt; 1775 Mat X; 1776 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1777 PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 1778 cusparseDnMatDescr_t matBDescr; 1779 cusparseDnMatDescr_t matCDescr; 1780 size_t spmmBufferSize; 1781 void *spmmBuffer; 1782 PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/ 1783 #endif 1784 }; 1785 1786 static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 1787 { 1788 PetscErrorCode ierr; 1789 MatMatCusparse *mmdata = (MatMatCusparse *)data; 1790 cudaError_t cerr; 1791 1792 PetscFunctionBegin; 1793 cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr); 1794 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1795 cusparseStatus_t stat; 1796 if (mmdata->matBDescr) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat);} 1797 if (mmdata->matCDescr) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat);} 1798 if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);} 1799 #endif 1800 ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr); 1801 ierr = PetscFree(data);CHKERRQ(ierr); 1802 PetscFunctionReturn(0); 1803 } 1804 1805 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool); 1806 1807 static PetscErrorCode 
MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 1808 { 1809 Mat_Product *product = C->product; 1810 Mat A,B; 1811 PetscInt m,n,blda,clda; 1812 PetscBool flg,biscuda; 1813 Mat_SeqAIJCUSPARSE *cusp; 1814 cusparseStatus_t stat; 1815 cusparseOperation_t opA; 1816 const PetscScalar *barray; 1817 PetscScalar *carray; 1818 PetscErrorCode ierr; 1819 MatMatCusparse *mmdata; 1820 Mat_SeqAIJCUSPARSEMultStruct *mat; 1821 CsrMatrix *csrmat; 1822 cudaError_t cerr; 1823 1824 PetscFunctionBegin; 1825 MatCheckProduct(C,1); 1826 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 1827 mmdata = (MatMatCusparse*)product->data; 1828 A = product->A; 1829 B = product->B; 1830 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 1831 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 1832 /* currently CopyToGpu does not copy if the matrix is bound to CPU 1833 Instead of silently accepting the wrong answer, I prefer to raise the error */ 1834 if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 1835 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1836 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1837 switch (product->type) { 1838 case MATPRODUCT_AB: 1839 case MATPRODUCT_PtAP: 1840 mat = cusp->mat; 1841 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1842 m = A->rmap->n; 1843 n = B->cmap->n; 1844 break; 1845 case MATPRODUCT_AtB: 1846 if (!cusp->transgen) { 1847 mat = cusp->mat; 1848 opA = CUSPARSE_OPERATION_TRANSPOSE; 1849 } else { 1850 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1851 mat = cusp->matTranspose; 1852 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1853 } 1854 m = A->cmap->n; 1855 n = B->cmap->n; 1856 break; 1857 case MATPRODUCT_ABt: 1858 case MATPRODUCT_RARt: 1859 mat = cusp->mat; 1860 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1861 m = A->rmap->n; 1862 n = B->rmap->n; 1863 break; 1864 default: 1865 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 1866 } 1867 if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 1868 csrmat = (CsrMatrix*)mat->mat; 1869 /* if the user passed a CPU matrix, copy the data to the GPU */ 1870 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr); 1871 if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);} 1872 ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr); 1873 1874 ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr); 1875 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 1876 ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 1877 ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr); 1878 } else { 1879 ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr); 1880 ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr); 1881 } 1882 1883 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1884 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1885 cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? 
CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate spmmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmdata->spmmBufferSize);CHKERRCUSPARSE(stat);
    if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);}
    cerr = cudaMalloc(&mmdata->spmmBuffer,mmdata->spmmBufferSize);CHKERRCUDA(cerr);
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->spmmBuffer);CHKERRCUSPARSE(stat);
#else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ?
mmdata->Bt : barray,blda,mat->beta_zero, 1953 carray,clda);CHKERRCUSPARSE(stat); 1954 #endif 1955 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1956 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1957 ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr); 1958 ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr); 1959 if (product->type == MATPRODUCT_RARt) { 1960 ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 1961 ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 1962 } else if (product->type == MATPRODUCT_PtAP) { 1963 ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 1964 ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 1965 } else { 1966 ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr); 1967 } 1968 if (mmdata->cisdense) { 1969 ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr); 1970 } 1971 if (!biscuda) { 1972 ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 1973 } 1974 PetscFunctionReturn(0); 1975 } 1976 1977 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 1978 { 1979 Mat_Product *product = C->product; 1980 Mat A,B; 1981 PetscInt m,n; 1982 PetscBool cisdense,flg; 1983 PetscErrorCode ierr; 1984 MatMatCusparse *mmdata; 1985 Mat_SeqAIJCUSPARSE *cusp; 1986 1987 PetscFunctionBegin; 1988 MatCheckProduct(C,1); 1989 if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty"); 1990 A = product->A; 1991 B = product->B; 1992 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 1993 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 1994 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1995 if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 1996 switch (product->type) { 1997 case MATPRODUCT_AB: 1998 m = A->rmap->n; 1999 n = B->cmap->n; 2000 break; 2001 case MATPRODUCT_AtB: 2002 m = A->cmap->n; 2003 n = B->cmap->n; 2004 break; 2005 case MATPRODUCT_ABt: 2006 m = A->rmap->n; 2007 n = B->rmap->n; 2008 break; 2009 case MATPRODUCT_PtAP: 2010 m = B->cmap->n; 2011 n = B->cmap->n; 2012 break; 2013 case MATPRODUCT_RARt: 2014 m = B->rmap->n; 2015 n = B->rmap->n; 2016 break; 2017 default: 2018 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 2019 } 2020 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2021 /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */ 2022 ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr); 2023 ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr); 2024 2025 /* product data */ 2026 ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2027 mmdata->cisdense = cisdense; 2028 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0) 2029 /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */ 2030 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2031 cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr); 2032 } 2033 #endif 2034 /* for these products we need intermediate storage */ 2035 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2036 ierr = 
MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr); 2037 ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr); 2038 if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */ 2039 ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr); 2040 } else { 2041 ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr); 2042 } 2043 } 2044 C->product->data = mmdata; 2045 C->product->destroy = MatDestroy_MatMatCusparse; 2046 2047 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 2048 PetscFunctionReturn(0); 2049 } 2050 2051 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 2052 2053 /* handles dense B */ 2054 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C) 2055 { 2056 Mat_Product *product = C->product; 2057 PetscErrorCode ierr; 2058 2059 PetscFunctionBegin; 2060 MatCheckProduct(C,1); 2061 if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A"); 2062 if (product->A->boundtocpu) { 2063 ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(C);CHKERRQ(ierr); 2064 PetscFunctionReturn(0); 2065 } 2066 switch (product->type) { 2067 case MATPRODUCT_AB: 2068 case MATPRODUCT_AtB: 2069 case MATPRODUCT_ABt: 2070 case MATPRODUCT_PtAP: 2071 case MATPRODUCT_RARt: 2072 C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 2073 default: 2074 break; 2075 } 2076 PetscFunctionReturn(0); 2077 } 2078 2079 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2080 { 2081 PetscErrorCode ierr; 2082 2083 PetscFunctionBegin; 2084 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2085 PetscFunctionReturn(0); 2086 } 2087 2088 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz) 2089 { 2090 PetscErrorCode ierr; 2091 2092 PetscFunctionBegin; 2093 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2094 PetscFunctionReturn(0); 2095 } 2096 2097 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2098 { 2099 PetscErrorCode ierr; 2100 2101 PetscFunctionBegin; 2102 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 2103 PetscFunctionReturn(0); 2104 } 2105 2106 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2107 { 2108 PetscErrorCode ierr; 2109 2110 PetscFunctionBegin; 2111 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 2112 PetscFunctionReturn(0); 2113 } 2114 2115 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2116 { 2117 PetscErrorCode ierr; 2118 2119 PetscFunctionBegin; 2120 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2121 PetscFunctionReturn(0); 2122 } 2123 2124 /* z = op(A) x + y. 
   If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode ierr;
  cudaError_t cerr;
  cusparseStatus_t stat;
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt nx,ny;
#endif

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) {
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !cusparsestruct->transgen) {
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get an up-to-date zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with fewer rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
         allocated to accommodate different uses. So we get the length info directly from mat.
      */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^T x is of full length, so we set beta to 1.0 if y exists.
      */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ?
matstruct->beta_one : matstruct->beta_zero; 2195 if (compressed) { /* Scatter x to work vector */ 2196 thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 2197 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 2198 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 2199 VecCUDAEqualsReverse()); 2200 } 2201 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2202 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2203 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2204 nx = mat->num_rows; 2205 ny = mat->num_cols; 2206 } 2207 #endif 2208 } 2209 2210 /* csr_spmv does y = alpha op(A) x + beta y */ 2211 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2212 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2213 if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 2214 if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 2215 stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 2216 stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 2217 stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, 2218 matstruct->matDescr, 2219 matstruct->cuSpMV[opA].vecXDescr, beta, 2220 matstruct->cuSpMV[opA].vecYDescr, 2221 cusparse_scalartype, 2222 cusparsestruct->spmvAlg, 2223 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat); 2224 cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr); 2225 2226 matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 2227 } else { 2228 /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 2229 stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat); 2230 stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat); 2231 } 2232 2233 stat = cusparseSpMV(cusparsestruct->handle, opA, 2234 matstruct->alpha_one, 2235 matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */ 2236 matstruct->cuSpMV[opA].vecXDescr, 2237 beta, 2238 matstruct->cuSpMV[opA].vecYDescr, 2239 cusparse_scalartype, 2240 cusparsestruct->spmvAlg, 2241 matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat); 2242 #else 2243 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2244 stat = cusparse_csr_spmv(cusparsestruct->handle, opA, 2245 mat->num_rows, mat->num_cols, 2246 mat->num_entries, matstruct->alpha_one, matstruct->descr, 2247 mat->values->data().get(), mat->row_offsets->data().get(), 2248 mat->column_indices->data().get(), xptr, beta, 2249 dptr);CHKERRCUSPARSE(stat); 2250 #endif 2251 } else { 2252 if (cusparsestruct->nrows) { 2253 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2254 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 2255 #else 2256 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 2257 stat = cusparse_hyb_spmv(cusparsestruct->handle, opA, 2258 matstruct->alpha_one, matstruct->descr, hybMat, 2259 xptr, beta, 2260 dptr);CHKERRCUSPARSE(stat); 2261 #endif 2262 } 2263 } 2264 cerr = 
WaitForCUDA();CHKERRCUDA(cerr); 2265 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2266 2267 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 2268 if (yy) { /* MatMultAdd: zz = A*xx + yy */ 2269 if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 2270 ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */ 2271 } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */ 2272 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 2273 } 2274 } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 2275 ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr); 2276 } 2277 2278 /* ScatterAdd the result from work vector into the full vector when A is compressed */ 2279 if (compressed) { 2280 thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 2281 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2282 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 2283 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 2284 VecCUDAPlusEquals()); 2285 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2286 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2287 } 2288 } else { 2289 if (yy && yy != zz) { 2290 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 2291 } 2292 } 2293 ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 2294 if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);} 2295 else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);} 2296 } catch(char *ex) { 2297 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 2298 } 2299 if (yy) { 2300 ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); 2301 } else { 2302 ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr); 2303 } 2304 PetscFunctionReturn(0); 2305 } 2306 2307 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2308 { 2309 PetscErrorCode ierr; 2310 2311 PetscFunctionBegin; 2312 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2313 PetscFunctionReturn(0); 2314 } 2315 2316 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 2317 { 2318 PetscErrorCode ierr; 2319 PetscSplitCSRDataStructure *d_mat = NULL, h_mat; 2320 PetscBool is_seq = PETSC_TRUE; 2321 PetscInt nnz_state = A->nonzerostate; 2322 PetscFunctionBegin; 2323 if (A->factortype == MAT_FACTOR_NONE) { 2324 d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat; 2325 } 2326 if (d_mat) { 2327 cudaError_t err; 2328 ierr = PetscInfo(A,"Assemble device matrix\n");CHKERRQ(ierr); 2329 err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err); 2330 nnz_state = h_mat.nonzerostate; 2331 is_seq = h_mat.seq; 2332 } 2333 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it? 
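  /* Descriptive note on the logic below: decide where the freshly assembled values live. If assembly happened on the
     CPU (the usual MatSetValues() path, with A->nonzerostate at least as new as the device matrix's state) the data is
     mirrored to the GPU; if a device-side split-CSR matrix reports a newer nonzero state, the GPU copy is the
     up-to-date one and is only flagged as such. */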
  if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0);
  if (A->factortype == MAT_FACTOR_NONE && A->nonzerostate >= nnz_state && is_seq) { // assembled on CPU even though equipped for GPU
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  } else if (nnz_state > A->nonzerostate) {
    A->offloadmask = PETSC_OFFLOAD_GPU;
  }

  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVIDIA GPUs and will use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored.

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage) is fully compatible with standard Fortran 77
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation. For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.
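
   Example usage (a minimal sketch; the sizes m and n and the vectors x and y are illustrative and assumed
   to be created by the caller). Entries are inserted on the host as usual, and subsequent MatMult() calls
   then run on the GPU through the CUSPARSE library:

     Mat A;
     ierr = MatCreateSeqAIJCUSPARSE(PETSC_COMM_SELF,m,n,5,NULL,&A);CHKERRQ(ierr);
     ierr = MatSetValue(A,0,0,2.0,INSERT_VALUES);CHKERRQ(ierr);
     ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
     ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
     ierr = MatMult(A,x,y);CHKERRQ(ierr);
     ierr = MatDestroy(&A);CHKERRQ(ierr);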
2387 2388 Level: intermediate 2389 2390 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 2391 @*/ 2392 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 2393 { 2394 PetscErrorCode ierr; 2395 2396 PetscFunctionBegin; 2397 ierr = MatCreate(comm,A);CHKERRQ(ierr); 2398 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 2399 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 2400 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 2401 PetscFunctionReturn(0); 2402 } 2403 2404 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 2405 { 2406 PetscErrorCode ierr; 2407 PetscSplitCSRDataStructure *d_mat = NULL; 2408 2409 PetscFunctionBegin; 2410 if (A->factortype == MAT_FACTOR_NONE) { 2411 d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat; 2412 ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL; 2413 ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr); 2414 } else { 2415 ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr); 2416 } 2417 if (d_mat) { 2418 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2419 cudaError_t err; 2420 PetscSplitCSRDataStructure h_mat; 2421 ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr); 2422 err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err); 2423 if (h_mat.seq) { 2424 if (a->compressedrow.use) { 2425 err = cudaFree(h_mat.diag.i);CHKERRCUDA(err); 2426 } 2427 err = cudaFree(d_mat);CHKERRCUDA(err); 2428 } 2429 } 2430 ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr); 2431 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 2432 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 2433 ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 2434 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 2435 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 2436 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 2437 PetscFunctionReturn(0); 2438 } 2439 2440 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*); 2441 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool); 2442 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B) 2443 { 2444 PetscErrorCode ierr; 2445 2446 PetscFunctionBegin; 2447 ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 2448 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr); 2449 PetscFunctionReturn(0); 2450 } 2451 2452 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg) 2453 { 2454 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2455 PetscErrorCode ierr; 2456 2457 PetscFunctionBegin; 2458 if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0); 2459 if (flg) { 2460 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 2461 2462 A->ops->mult = MatMult_SeqAIJ; 2463 A->ops->multadd = MatMultAdd_SeqAIJ; 2464 A->ops->multtranspose = MatMultTranspose_SeqAIJ; 2465 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 2466 A->ops->multhermitiantranspose = NULL; 2467 
A->ops->multhermitiantransposeadd = NULL; 2468 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 2469 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 2470 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 2471 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 2472 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr); 2473 } else { 2474 A->ops->mult = MatMult_SeqAIJCUSPARSE; 2475 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 2476 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 2477 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 2478 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 2479 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 2480 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 2481 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 2482 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 2483 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 2484 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr); 2485 } 2486 A->boundtocpu = flg; 2487 a->inode.use = flg; 2488 PetscFunctionReturn(0); 2489 } 2490 2491 static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) 2492 { 2493 PetscSplitCSRDataStructure *d_mat = NULL; 2494 PetscErrorCode ierr; 2495 PetscBool both = PETSC_FALSE; 2496 2497 PetscFunctionBegin; 2498 if (A->factortype == MAT_FACTOR_NONE) { 2499 Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr; 2500 if (spptr->mat) { 2501 CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat; 2502 if (matrix->values) { 2503 both = PETSC_TRUE; 2504 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 2505 } 2506 } 2507 if (spptr->matTranspose) { 2508 CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat; 2509 if (matrix->values) { 2510 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 2511 } 2512 } 2513 d_mat = spptr->deviceMat; 2514 } 2515 if (d_mat) { 2516 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2517 PetscInt n = A->rmap->n, nnz = a->i[n]; 2518 cudaError_t err; 2519 PetscScalar *vals; 2520 ierr = PetscInfo(A,"Zero device matrix\n");CHKERRQ(ierr); 2521 err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err); 2522 err = cudaMemset( vals, 0, (nnz)*sizeof(PetscScalar));CHKERRCUDA(err); 2523 } 2524 ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 2525 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 2526 2527 PetscFunctionReturn(0); 2528 } 2529 2530 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 2531 { 2532 PetscErrorCode ierr; 2533 cusparseStatus_t stat; 2534 Mat B; 2535 2536 PetscFunctionBegin; 2537 if (reuse == MAT_INITIAL_MATRIX) { 2538 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 2539 } else if (reuse == 
MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat;

  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      spptr->format = MAT_CUSPARSE_CSR;
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
      spptr->deviceMat = NULL;
    } else {
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;
  B->ops->zeroentries    = MatZeroEntries_SeqAIJCUSPARSE;

  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr);
  ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
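
   For example, a code that creates its matrices with MatCreate() and MatSetFromOptions() can be switched to this
   type and to a particular storage format at run time (the executable name ./ex1 below is purely illustrative):

     ./ex1 -mat_type aijcusparse -mat_cusparse_storage_format ell

   Equivalently, the type can be selected in code with MatSetType(A,MATSEQAIJCUSPARSE).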

   Level: beginner

.seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);

PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->cooPerm_v;
    delete (*cusparsestruct)->cooPerm_w;
    if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cudaError_t cerr = cudaFree((*cusparsestruct)->csr2cscBuffer);CHKERRCUDA(cerr);
#endif
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    delete (*mat)->values;
    delete (*mat)->column_indices;
    delete (*mat)->row_offsets;
    delete *mat;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr)     { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cudaError_t cerr;
    if ((*trifactor)->solveBuffer)   {cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
    if ((*trifactor)->csr2cscBuffer) {cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
#endif
    delete *trifactor;
    *trifactor = NULL;
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;

  PetscFunctionBegin;
  if (*matstruct) {
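    /* (*matstruct)->mat points either at an opaque cusparseHybMat_t (ELL/HYB formats,
       only available before CUDA-11) or at a CsrMatrix allocated by this class (CSR format) */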
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
#endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        CsrMatrix_Destroy(&mat);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha_one) { err = cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err = cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err = cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector   = NULL;
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    if ((handle = (*trifactors)->handle)) {
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
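
/* Helper functors for the COO assembly routines below: IJCompare orders (i,j) pairs
   lexicographically for thrust::sort_by_key, IJEqual detects duplicate (i,j) entries for
   thrust::unique, and IJDiff/IJSum are used to build the reduction keys that collapse
   repeated (i,j) locations into a single nonzero. */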
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    if (t1.get<0>() < t2.get<0>()) return true;
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
    return false;
  }
};

struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
    return true;
  }
};

struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return t1 == t2 ? 0 : 1;
  }
};

struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return t1||t2;
  }
};

#include <thrust/iterator/discard_iterator.h>
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *matrix;
  PetscErrorCode     ierr;
  cudaError_t        cerr;
  PetscInt           n;

  PetscFunctionBegin;
  if (!cusp)      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  n      = cusp->cooPerm->size();
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!cusp->cooPerm_v) { cusp->cooPerm_v = new THRUSTARRAY(n); }
  ierr = PetscLogEventBegin(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  if (v) {
    cusp->cooPerm_v->assign(v,v+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
  } else thrust::fill(thrust::device,cusp->cooPerm_v->begin(),cusp->cooPerm_v->end(),0.);
  if (imode == ADD_VALUES) {
    if (cusp->cooPerm_a) {
      if (!cusp->cooPerm_w) cusp->cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cusp->cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cusp->cooPerm_w->begin(),cusp->cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
    } else {
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals());
    }
  } else {
    if (cusp->cooPerm_a) { /* non-unique insertion: if two different values target the same location,
                              the result is undefined since we cannot guarantee the last one takes precedence */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()),
                                                                thrust::make_permutation_iterator(matrix->values->begin(),cusp->cooPerm_a->begin())));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()),
                                                                thrust::make_permutation_iterator(matrix->values->begin(),cusp->cooPerm_a->end())));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    } else {
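      /* all COO entries were unique: cooPerm maps each CSR nonzero back to the position of
         its value in v, so the values can be copied into matrix->values directly */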
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    }
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  /* we can remove this call when MatSeqAIJGetArray operations are used everywhere! */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#include <thrust/binary_search.h>
PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *matrix;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0;
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(MAT_CUSPARSEPreallCOO,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    delete cusp->cooPerm;
    delete cusp->cooPerm_v;
    delete cusp->cooPerm_w;
    delete cusp->cooPerm_a;
    cusp->cooPerm   = NULL;
    cusp->cooPerm_v = NULL;
    cusp->cooPerm_w = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());
    *cusp->cooPerm_a = d_i;
    THRUSTINTARRAY w = d_j;

    auto nekey = thrust::unique(fkey, ekey, IJEqual());
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* repeated entries: flag each sorted entry that starts a new (i,j) location, then
                prefix-sum the flags to obtain the reduction keys used by reduce_by_key above */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());
      (*cusp->cooPerm_a)[0] = 0;
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(),
                        search_begin, search_begin + A->rmap->n,
                        ii.begin());
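    /* rebuild the host CSR structure (a->i, a->j) from the de-duplicated device indices;
       the numerical values are filled in later by MatSetValuesCOO() */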
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0;
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
    }
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEPreallCOO,A,0,0,0);CHKERRQ(ierr);

  /* Allocate the CUSPARSE struct for matvec now: the host values copied to the GPU here are
     still uninitialized, so they are zeroed on the device immediately below and later
     overwritten by MatSetValuesCOO() */
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  {
    matrix = (CsrMatrix*)cusp->mat->mat;
    if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
    thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
  }

  A->offloadmask   = PETSC_OFFLOAD_CPU;
  A->assembled     = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
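
/* Usage sketch for the COO assembly path implemented above (illustrative only; the array
   contents and sizes are made up and error checking is omitted):

     PetscInt    coo_i[] = {0,0,1};            // row indices (repeats are summed with ADD_VALUES)
     PetscInt    coo_j[] = {0,1,1};            // column indices
     PetscScalar v[]     = {1.0,2.0,3.0};

     MatSetPreallocationCOO(A,3,coo_i,coo_j);  // builds the CSR pattern and the device permutation
     MatSetValuesCOO(A,v,ADD_VALUES);          // scatters/reduces v into the GPU CSR values
*/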