1 /* 2 Defines the basic matrix operations for the AIJ (compressed row) 3 matrix storage format using the CUSPARSE library, 4 */ 5 #define PETSC_SKIP_SPINLOCK 6 #define PETSC_SKIP_CXX_COMPLEX_FIX 7 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 8 9 #include <petscconf.h> 10 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 11 #include <../src/mat/impls/sbaij/seq/sbaij.h> 12 #include <../src/vec/vec/impls/dvecimpl.h> 13 #include <petsc/private/vecimpl.h> 14 #undef VecType 15 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 16 17 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 18 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 19 /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in 20 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them. 21 22 typedef enum { 23 CUSPARSE_MV_ALG_DEFAULT = 0, 24 CUSPARSE_COOMV_ALG = 1, 25 CUSPARSE_CSRMV_ALG1 = 2, 26 CUSPARSE_CSRMV_ALG2 = 3 27 } cusparseSpMVAlg_t; 28 29 typedef enum { 30 CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0, 31 CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1, 32 CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2, 33 CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3, 34 CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4, 35 CUSPARSE_SPMM_ALG_DEFAULT = 0, 36 CUSPARSE_SPMM_COO_ALG1 = 1, 37 CUSPARSE_SPMM_COO_ALG2 = 2, 38 CUSPARSE_SPMM_COO_ALG3 = 3, 39 CUSPARSE_SPMM_COO_ALG4 = 5, 40 CUSPARSE_SPMM_CSR_ALG1 = 4, 41 CUSPARSE_SPMM_CSR_ALG2 = 6, 42 } cusparseSpMMAlg_t; 43 44 typedef enum { 45 CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc 46 CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministc 47 } cusparseCsr2CscAlg_t; 48 */ 49 const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0}; 50 const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0}; 51 const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0}; 52 #endif 53 54 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 55 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 56 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 57 58 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 59 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 60 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 61 62 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 63 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 64 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 65 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 66 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat); 67 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 68 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 69 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 70 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 71 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 72 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 73 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool); 74 75 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**); 76 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**); 77 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat); 78 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**); 79 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**); 80 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**); 81 82 PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]); 83 PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode); 84 85 PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream) 86 { 87 cusparseStatus_t stat; 88 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 89 90 PetscFunctionBegin; 91 if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 92 cusparsestruct->stream = stream; 93 stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat); 94 PetscFunctionReturn(0); 95 } 96 97 PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle) 98 { 99 cusparseStatus_t stat; 100 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 101 102 PetscFunctionBegin; 103 if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 104 if (cusparsestruct->handle != handle) { 105 if (cusparsestruct->handle) { 106 stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat); 107 } 108 cusparsestruct->handle = handle; 109 } 110 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 111 PetscFunctionReturn(0); 112 } 113 114 PetscErrorCode MatCUSPARSEClearHandle(Mat A) 115 { 116 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 117 PetscBool flg; 118 PetscErrorCode ierr; 119 120 PetscFunctionBegin; 121 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 122 if (!flg || !cusparsestruct) PetscFunctionReturn(0); 123 if (cusparsestruct->handle) cusparsestruct->handle = 0; 124 PetscFunctionReturn(0); 125 } 126 127 PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type) 128 { 129 PetscFunctionBegin; 130 *type = MATSOLVERCUSPARSE; 131 PetscFunctionReturn(0); 132 } 133 134 /*MC 135 MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices 136 on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported 137 algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 138 performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 139 CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 140 algorithms are not recommended. This class does NOT support direct solver operations. 141 142 Level: beginner 143 144 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 145 M*/ 146 147 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B) 148 { 149 PetscErrorCode ierr; 150 PetscInt n = A->rmap->n; 151 152 PetscFunctionBegin; 153 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 154 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 155 (*B)->factortype = ftype; 156 (*B)->useordering = PETSC_TRUE; 157 ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 158 159 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 160 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 161 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 162 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 163 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 164 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 165 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 166 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 167 168 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 169 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr); 170 PetscFunctionReturn(0); 171 } 172 173 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 174 { 175 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 176 177 PetscFunctionBegin; 178 switch (op) { 179 case MAT_CUSPARSE_MULT: 180 cusparsestruct->format = format; 181 break; 182 case MAT_CUSPARSE_ALL: 183 cusparsestruct->format = format; 184 break; 185 default: 186 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op); 187 } 188 PetscFunctionReturn(0); 189 } 190 191 /*@ 192 MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 193 operation. Only the MatMult operation can use different GPU storage formats 194 for MPIAIJCUSPARSE matrices. 195 Not Collective 196 197 Input Parameters: 198 + A - Matrix of type SEQAIJCUSPARSE 199 . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 200 - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2) 201 202 Output Parameter: 203 204 Level: intermediate 205 206 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 207 @*/ 208 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 209 { 210 PetscErrorCode ierr; 211 212 PetscFunctionBegin; 213 PetscValidHeaderSpecific(A, MAT_CLASSID,1); 214 ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr); 215 PetscFunctionReturn(0); 216 } 217 218 /*@ 219 MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the tranpose matrix before calling MatMultTranspose 220 221 Collective on mat 222 223 Input Parameters: 224 + A - Matrix of type SEQAIJCUSPARSE 225 - transgen - the boolean flag 226 227 Level: intermediate 228 229 .seealso: MATSEQAIJCUSPARSE 230 @*/ 231 PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen) 232 { 233 PetscErrorCode ierr; 234 PetscBool flg; 235 236 PetscFunctionBegin; 237 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 238 ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 239 if (flg) { 240 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 241 242 if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 243 cusp->transgen = transgen; 244 if (!transgen) { /* need to destroy the transpose matrix if present to prevent from logic errors if transgen is set to true later */ 245 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr); 246 } 247 } 248 PetscFunctionReturn(0); 249 } 250 251 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 252 { 253 PetscErrorCode ierr; 254 MatCUSPARSEStorageFormat format; 255 PetscBool flg; 256 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 257 258 PetscFunctionBegin; 259 ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr); 260 if (A->factortype == MAT_FACTOR_NONE) { 261 PetscBool transgen = cusparsestruct->transgen; 262 263 ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr); 264 if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);} 265 266 ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV", 267 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 268 if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);} 269 270 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 271 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 272 if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);} 273 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 274 cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 275 ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", 276 "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr); 277 /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */ 278 if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 279 280 cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 281 ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", 282 "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr); 283 if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly"); 284 285 cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 286 ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", 287 "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr); 288 if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly"); 289 #endif 290 } 291 ierr = PetscOptionsTail();CHKERRQ(ierr); 292 PetscFunctionReturn(0); 293 } 294 295 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 296 { 297 PetscErrorCode ierr; 298 299 PetscFunctionBegin; 300 ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 301 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 302 PetscFunctionReturn(0); 303 } 304 305 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 306 { 307 PetscErrorCode ierr; 308 309 PetscFunctionBegin; 310 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 311 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 312 PetscFunctionReturn(0); 313 } 314 315 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 316 { 317 PetscErrorCode ierr; 318 319 PetscFunctionBegin; 320 ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 321 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 322 PetscFunctionReturn(0); 323 } 324 325 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 326 { 327 PetscErrorCode ierr; 328 329 PetscFunctionBegin; 330 ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 331 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 332 PetscFunctionReturn(0); 333 } 334 335 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 336 { 337 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 338 PetscInt n = A->rmap->n; 339 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 340 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 341 cusparseStatus_t stat; 342 const PetscInt *ai = a->i,*aj = a->j,*vi; 343 const MatScalar *aa = a->a,*v; 344 PetscInt *AiLo, *AjLo; 345 PetscScalar *AALo; 346 PetscInt i,nz, nzLower, offset, rowOffset; 347 PetscErrorCode ierr; 348 cudaError_t cerr; 349 350 PetscFunctionBegin; 351 if (!n) PetscFunctionReturn(0); 352 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 353 try { 354 /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */ 355 nzLower=n+ai[n]-ai[1]; 356 357 /* Allocate Space for the lower triangular matrix */ 358 cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 359 cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr); 360 cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr); 361 362 /* Fill the lower triangular matrix */ 363 AiLo[0] = (PetscInt) 0; 364 AiLo[n] = nzLower; 365 AjLo[0] = (PetscInt) 0; 366 AALo[0] = (MatScalar) 1.0; 367 v = aa; 368 vi = aj; 369 offset = 1; 370 rowOffset= 1; 371 for (i=1; i<n; i++) { 372 nz = ai[i+1] - ai[i]; 373 /* additional 1 for the term on the diagonal */ 374 AiLo[i] = rowOffset; 375 rowOffset += nz+1; 376 377 ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr); 378 ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr); 379 380 offset += nz; 381 AjLo[offset] = (PetscInt) i; 382 AALo[offset] = (MatScalar) 1.0; 383 offset += 1; 384 385 v += nz; 386 vi += nz; 387 } 388 389 /* allocate space for the triangular factor information */ 390 loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 391 392 /* Create the matrix description */ 393 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 394 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 395 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 396 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 397 #else 398 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 399 #endif 400 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat); 401 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 402 403 /* set the operation */ 404 loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 405 406 /* set the matrix */ 407 loTriFactor->csrMat = new CsrMatrix; 408 loTriFactor->csrMat->num_rows = n; 409 loTriFactor->csrMat->num_cols = n; 410 loTriFactor->csrMat->num_entries = nzLower; 411 412 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 413 loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1); 414 415 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 416 loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower); 417 418 loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 419 loTriFactor->csrMat->values->assign(AALo, AALo+nzLower); 420 421 /* Create the solve analysis information */ 422 stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 423 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 424 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 425 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 426 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 427 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 428 &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 429 cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 430 #endif 431 432 /* perform the solve analysis */ 433 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 434 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 435 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 436 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo 437 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 438 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 439 #endif 440 );CHKERRCUSPARSE(stat); 441 442 /* assign the pointer. Is this really necessary? */ 443 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 444 445 cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr); 446 cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr); 447 cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr); 448 ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 449 } catch(char *ex) { 450 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 451 } 452 } 453 PetscFunctionReturn(0); 454 } 455 456 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 457 { 458 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 459 PetscInt n = A->rmap->n; 460 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 461 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 462 cusparseStatus_t stat; 463 const PetscInt *aj = a->j,*adiag = a->diag,*vi; 464 const MatScalar *aa = a->a,*v; 465 PetscInt *AiUp, *AjUp; 466 PetscScalar *AAUp; 467 PetscInt i,nz, nzUpper, offset; 468 PetscErrorCode ierr; 469 cudaError_t cerr; 470 471 PetscFunctionBegin; 472 if (!n) PetscFunctionReturn(0); 473 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 474 try { 475 /* next, figure out the number of nonzeros in the upper triangular matrix. */ 476 nzUpper = adiag[0]-adiag[n]; 477 478 /* Allocate Space for the upper triangular matrix */ 479 cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 480 cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 481 cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 482 483 /* Fill the upper triangular matrix */ 484 AiUp[0]=(PetscInt) 0; 485 AiUp[n]=nzUpper; 486 offset = nzUpper; 487 for (i=n-1; i>=0; i--) { 488 v = aa + adiag[i+1] + 1; 489 vi = aj + adiag[i+1] + 1; 490 491 /* number of elements NOT on the diagonal */ 492 nz = adiag[i] - adiag[i+1]-1; 493 494 /* decrement the offset */ 495 offset -= (nz+1); 496 497 /* first, set the diagonal elements */ 498 AjUp[offset] = (PetscInt) i; 499 AAUp[offset] = (MatScalar)1./v[nz]; 500 AiUp[i] = AiUp[i+1] - (nz+1); 501 502 ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr); 503 ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr); 504 } 505 506 /* allocate space for the triangular factor information */ 507 upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 508 509 /* Create the matrix description */ 510 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 511 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 512 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 513 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 514 #else 515 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 516 #endif 517 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 518 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 519 520 /* set the operation */ 521 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 522 523 /* set the matrix */ 524 upTriFactor->csrMat = new CsrMatrix; 525 upTriFactor->csrMat->num_rows = n; 526 upTriFactor->csrMat->num_cols = n; 527 upTriFactor->csrMat->num_entries = nzUpper; 528 529 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 530 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 531 532 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 533 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 534 535 upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 536 upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 537 538 /* Create the solve analysis information */ 539 stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 540 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 541 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 542 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 543 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 544 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 545 &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 546 cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 547 #endif 548 549 /* perform the solve analysis */ 550 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 551 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 552 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 553 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo 554 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 555 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 556 #endif 557 );CHKERRCUSPARSE(stat); 558 559 /* assign the pointer. Is this really necessary? */ 560 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 561 562 cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 563 cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 564 cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr); 565 ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 566 } catch(char *ex) { 567 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 568 } 569 } 570 PetscFunctionReturn(0); 571 } 572 573 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 574 { 575 PetscErrorCode ierr; 576 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 577 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 578 IS isrow = a->row,iscol = a->icol; 579 PetscBool row_identity,col_identity; 580 const PetscInt *r,*c; 581 PetscInt n = A->rmap->n; 582 583 PetscFunctionBegin; 584 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 585 ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr); 586 ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr); 587 588 cusparseTriFactors->workVector = new THRUSTARRAY(n); 589 cusparseTriFactors->nnz=a->nz; 590 591 A->offloadmask = PETSC_OFFLOAD_BOTH; 592 /* lower triangular indices */ 593 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 594 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 595 if (!row_identity) { 596 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 597 cusparseTriFactors->rpermIndices->assign(r, r+n); 598 } 599 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 600 601 /* upper triangular indices */ 602 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 603 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 604 if (!col_identity) { 605 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 606 cusparseTriFactors->cpermIndices->assign(c, c+n); 607 } 608 609 if (!row_identity && !col_identity) { 610 ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr); 611 } else if(!row_identity) { 612 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 613 } else if(!col_identity) { 614 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 615 } 616 617 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 618 PetscFunctionReturn(0); 619 } 620 621 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 622 { 623 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 624 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 625 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 626 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 627 cusparseStatus_t stat; 628 PetscErrorCode ierr; 629 cudaError_t cerr; 630 PetscInt *AiUp, *AjUp; 631 PetscScalar *AAUp; 632 PetscScalar *AALo; 633 PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 634 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 635 const PetscInt *ai = b->i,*aj = b->j,*vj; 636 const MatScalar *aa = b->a,*v; 637 638 PetscFunctionBegin; 639 if (!n) PetscFunctionReturn(0); 640 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 641 try { 642 /* Allocate Space for the upper triangular matrix */ 643 cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 644 cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 645 cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 646 cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 647 648 /* Fill the upper triangular matrix */ 649 AiUp[0]=(PetscInt) 0; 650 AiUp[n]=nzUpper; 651 offset = 0; 652 for (i=0; i<n; i++) { 653 /* set the pointers */ 654 v = aa + ai[i]; 655 vj = aj + ai[i]; 656 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 657 658 /* first, set the diagonal elements */ 659 AjUp[offset] = (PetscInt) i; 660 AAUp[offset] = (MatScalar)1.0/v[nz]; 661 AiUp[i] = offset; 662 AALo[offset] = (MatScalar)1.0/v[nz]; 663 664 offset+=1; 665 if (nz>0) { 666 ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr); 667 ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 668 for (j=offset; j<offset+nz; j++) { 669 AAUp[j] = -AAUp[j]; 670 AALo[j] = AAUp[j]/v[nz]; 671 } 672 offset+=nz; 673 } 674 } 675 676 /* allocate space for the triangular factor information */ 677 upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 678 679 /* Create the matrix description */ 680 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 681 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 682 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 683 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 684 #else 685 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 686 #endif 687 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 688 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 689 690 /* set the matrix */ 691 upTriFactor->csrMat = new CsrMatrix; 692 upTriFactor->csrMat->num_rows = A->rmap->n; 693 upTriFactor->csrMat->num_cols = A->cmap->n; 694 upTriFactor->csrMat->num_entries = a->nz; 695 696 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 697 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 698 699 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 700 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 701 702 upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 703 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 704 705 /* set the operation */ 706 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 707 708 /* Create the solve analysis information */ 709 stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 710 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 711 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 712 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 713 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 714 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 715 &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 716 cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 717 #endif 718 719 /* perform the solve analysis */ 720 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 721 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 722 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 723 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo 724 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 725 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 726 #endif 727 );CHKERRCUSPARSE(stat); 728 729 /* assign the pointer. Is this really necessary? */ 730 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 731 732 /* allocate space for the triangular factor information */ 733 loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 734 735 /* Create the matrix description */ 736 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 737 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 738 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 739 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 740 #else 741 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 742 #endif 743 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 744 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 745 746 /* set the operation */ 747 loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 748 749 /* set the matrix */ 750 loTriFactor->csrMat = new CsrMatrix; 751 loTriFactor->csrMat->num_rows = A->rmap->n; 752 loTriFactor->csrMat->num_cols = A->cmap->n; 753 loTriFactor->csrMat->num_entries = a->nz; 754 755 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 756 loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 757 758 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 759 loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 760 761 loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 762 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 763 ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr); 764 765 /* Create the solve analysis information */ 766 stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 767 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 768 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 769 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 770 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 771 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 772 &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 773 cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 774 #endif 775 776 /* perform the solve analysis */ 777 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 778 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 779 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 780 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo 781 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 782 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 783 #endif 784 );CHKERRCUSPARSE(stat); 785 786 /* assign the pointer. Is this really necessary? */ 787 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 788 789 A->offloadmask = PETSC_OFFLOAD_BOTH; 790 cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 791 cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 792 cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr); 793 cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr); 794 } catch(char *ex) { 795 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 796 } 797 } 798 PetscFunctionReturn(0); 799 } 800 801 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 802 { 803 PetscErrorCode ierr; 804 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 805 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 806 IS ip = a->row; 807 const PetscInt *rip; 808 PetscBool perm_identity; 809 PetscInt n = A->rmap->n; 810 811 PetscFunctionBegin; 812 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 813 ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr); 814 cusparseTriFactors->workVector = new THRUSTARRAY(n); 815 cusparseTriFactors->nnz=(a->nz-n)*2 + n; 816 817 /* lower triangular indices */ 818 ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); 819 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 820 if (!perm_identity) { 821 IS iip; 822 const PetscInt *irip; 823 824 ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr); 825 ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr); 826 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 827 cusparseTriFactors->rpermIndices->assign(rip, rip+n); 828 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 829 cusparseTriFactors->cpermIndices->assign(irip, irip+n); 830 ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr); 831 ierr = ISDestroy(&iip);CHKERRQ(ierr); 832 ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr); 833 } 834 ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 835 PetscFunctionReturn(0); 836 } 837 838 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 839 { 840 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 841 IS isrow = b->row,iscol = b->col; 842 PetscBool row_identity,col_identity; 843 PetscErrorCode ierr; 844 845 PetscFunctionBegin; 846 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 847 B->offloadmask = PETSC_OFFLOAD_CPU; 848 /* determine which version of MatSolve needs to be used. */ 849 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 850 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 851 if (row_identity && col_identity) { 852 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 853 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 854 B->ops->matsolve = NULL; 855 B->ops->matsolvetranspose = NULL; 856 } else { 857 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 858 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 859 B->ops->matsolve = NULL; 860 B->ops->matsolvetranspose = NULL; 861 } 862 863 /* get the triangular factors */ 864 ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 865 PetscFunctionReturn(0); 866 } 867 868 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 869 { 870 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 871 IS ip = b->row; 872 PetscBool perm_identity; 873 PetscErrorCode ierr; 874 875 PetscFunctionBegin; 876 ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 877 B->offloadmask = PETSC_OFFLOAD_CPU; 878 /* determine which version of MatSolve needs to be used. */ 879 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 880 if (perm_identity) { 881 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 882 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 883 B->ops->matsolve = NULL; 884 B->ops->matsolvetranspose = NULL; 885 } else { 886 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 887 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 888 B->ops->matsolve = NULL; 889 B->ops->matsolvetranspose = NULL; 890 } 891 892 /* get the triangular factors */ 893 ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 894 PetscFunctionReturn(0); 895 } 896 897 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 898 { 899 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 900 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 901 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 902 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 903 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 904 cusparseStatus_t stat; 905 cusparseIndexBase_t indexBase; 906 cusparseMatrixType_t matrixType; 907 cusparseFillMode_t fillMode; 908 cusparseDiagType_t diagType; 909 cudaError_t cerr; 910 911 PetscFunctionBegin; 912 913 /*********************************************/ 914 /* Now the Transpose of the Lower Tri Factor */ 915 /*********************************************/ 916 917 /* allocate space for the transpose of the lower triangular factor */ 918 loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 919 920 /* set the matrix descriptors of the lower triangular factor */ 921 matrixType = cusparseGetMatType(loTriFactor->descr); 922 indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 923 fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 924 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 925 diagType = cusparseGetMatDiagType(loTriFactor->descr); 926 927 /* Create the matrix description */ 928 stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat); 929 stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 930 stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 931 stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 932 stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 933 934 /* set the operation */ 935 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 936 937 /* allocate GPU space for the CSC of the lower triangular factor*/ 938 loTriFactorT->csrMat = new CsrMatrix; 939 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 940 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 941 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 942 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1); 943 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 944 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 945 946 /* compute the transpose of the lower triangular factor, i.e. the CSC */ 947 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 948 stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 949 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 950 loTriFactor->csrMat->values->data().get(), 951 loTriFactor->csrMat->row_offsets->data().get(), 952 loTriFactor->csrMat->column_indices->data().get(), 953 loTriFactorT->csrMat->values->data().get(), 954 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 955 CUSPARSE_ACTION_NUMERIC,indexBase, 956 CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 957 cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 958 #endif 959 960 stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 961 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 962 loTriFactor->csrMat->values->data().get(), 963 loTriFactor->csrMat->row_offsets->data().get(), 964 loTriFactor->csrMat->column_indices->data().get(), 965 loTriFactorT->csrMat->values->data().get(), 966 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 967 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 968 CUSPARSE_ACTION_NUMERIC, indexBase, 969 CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer 970 #else 971 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 972 CUSPARSE_ACTION_NUMERIC, indexBase 973 #endif 974 );CHKERRCUSPARSE(stat); 975 976 /* Create the solve analysis information */ 977 stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 978 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 979 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, 980 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 981 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 982 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, 983 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 984 cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 985 #endif 986 987 /* perform the solve analysis */ 988 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 989 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 990 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 991 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo 992 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 993 ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 994 #endif 995 );CHKERRCUSPARSE(stat); 996 997 /* assign the pointer. Is this really necessary? */ 998 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 999 1000 /*********************************************/ 1001 /* Now the Transpose of the Upper Tri Factor */ 1002 /*********************************************/ 1003 1004 /* allocate space for the transpose of the upper triangular factor */ 1005 upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 1006 1007 /* set the matrix descriptors of the upper triangular factor */ 1008 matrixType = cusparseGetMatType(upTriFactor->descr); 1009 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 1010 fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1011 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1012 diagType = cusparseGetMatDiagType(upTriFactor->descr); 1013 1014 /* Create the matrix description */ 1015 stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat); 1016 stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 1017 stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 1018 stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 1019 stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1020 1021 /* set the operation */ 1022 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1023 1024 /* allocate GPU space for the CSC of the upper triangular factor*/ 1025 upTriFactorT->csrMat = new CsrMatrix; 1026 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1027 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1028 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1029 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1); 1030 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1031 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1032 1033 /* compute the transpose of the upper triangular factor, i.e. the CSC */ 1034 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1035 stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows, 1036 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1037 upTriFactor->csrMat->values->data().get(), 1038 upTriFactor->csrMat->row_offsets->data().get(), 1039 upTriFactor->csrMat->column_indices->data().get(), 1040 upTriFactorT->csrMat->values->data().get(), 1041 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1042 CUSPARSE_ACTION_NUMERIC,indexBase, 1043 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1044 cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1045 #endif 1046 1047 stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 1048 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1049 upTriFactor->csrMat->values->data().get(), 1050 upTriFactor->csrMat->row_offsets->data().get(), 1051 upTriFactor->csrMat->column_indices->data().get(), 1052 upTriFactorT->csrMat->values->data().get(), 1053 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1054 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1055 CUSPARSE_ACTION_NUMERIC, indexBase, 1056 CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer 1057 #else 1058 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1059 CUSPARSE_ACTION_NUMERIC, indexBase 1060 #endif 1061 );CHKERRCUSPARSE(stat); 1062 1063 /* Create the solve analysis information */ 1064 stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1065 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1066 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, 1067 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1068 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1069 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, 1070 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1071 cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1072 #endif 1073 1074 /* perform the solve analysis */ 1075 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 1076 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1077 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1078 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo 1079 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1080 ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1081 #endif 1082 );CHKERRCUSPARSE(stat); 1083 1084 /* assign the pointer. Is this really necessary? */ 1085 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 1086 PetscFunctionReturn(0); 1087 } 1088 1089 static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A) 1090 { 1091 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1092 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1093 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1094 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1095 cusparseStatus_t stat; 1096 cusparseIndexBase_t indexBase; 1097 cudaError_t err; 1098 PetscErrorCode ierr; 1099 1100 PetscFunctionBegin; 1101 if (!cusparsestruct->transgen || cusparsestruct->matTranspose) PetscFunctionReturn(0); 1102 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1103 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1104 /* create cusparse matrix */ 1105 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 1106 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat); 1107 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1108 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat); 1109 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1110 1111 /* set alpha and beta */ 1112 err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 1113 err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1114 err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1115 err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1116 err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1117 err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1118 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 1119 1120 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1121 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1122 CsrMatrix *matrixT= new CsrMatrix; 1123 matrixT->num_rows = A->cmap->n; 1124 matrixT->num_cols = A->rmap->n; 1125 matrixT->num_entries = a->nz; 1126 matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 1127 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1128 matrixT->values = new THRUSTARRAY(a->nz); 1129 1130 cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); 1131 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 1132 1133 /* compute the transpose, i.e. the CSC */ 1134 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1135 stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, 1136 A->cmap->n, matrix->num_entries, 1137 matrix->values->data().get(), 1138 cusparsestruct->rowoffsets_gpu->data().get(), 1139 matrix->column_indices->data().get(), 1140 matrixT->values->data().get(), 1141 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1142 CUSPARSE_ACTION_NUMERIC,indexBase, 1143 cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1144 err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err); 1145 #endif 1146 1147 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1148 A->cmap->n, matrix->num_entries, 1149 matrix->values->data().get(), 1150 cusparsestruct->rowoffsets_gpu->data().get(), 1151 matrix->column_indices->data().get(), 1152 matrixT->values->data().get(), 1153 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1154 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1155 CUSPARSE_ACTION_NUMERIC,indexBase, 1156 cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer 1157 #else 1158 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1159 CUSPARSE_ACTION_NUMERIC, indexBase 1160 #endif 1161 );CHKERRCUSPARSE(stat); 1162 matstructT->mat = matrixT; 1163 1164 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1165 stat = cusparseCreateCsr(&matstructT->matDescr, 1166 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1167 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1168 matrixT->values->data().get(), 1169 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1170 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 1171 #endif 1172 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1173 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1174 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1175 #else 1176 CsrMatrix *temp = new CsrMatrix; 1177 CsrMatrix *tempT = new CsrMatrix; 1178 /* First convert HYB to CSR */ 1179 temp->num_rows = A->rmap->n; 1180 temp->num_cols = A->cmap->n; 1181 temp->num_entries = a->nz; 1182 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1183 temp->column_indices = new THRUSTINTARRAY32(a->nz); 1184 temp->values = new THRUSTARRAY(a->nz); 1185 1186 stat = cusparse_hyb2csr(cusparsestruct->handle, 1187 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 1188 temp->values->data().get(), 1189 temp->row_offsets->data().get(), 1190 temp->column_indices->data().get());CHKERRCUSPARSE(stat); 1191 1192 /* Next, convert CSR to CSC (i.e. the matrix transpose) */ 1193 tempT->num_rows = A->rmap->n; 1194 tempT->num_cols = A->cmap->n; 1195 tempT->num_entries = a->nz; 1196 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1197 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1198 tempT->values = new THRUSTARRAY(a->nz); 1199 1200 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1201 temp->num_cols, temp->num_entries, 1202 temp->values->data().get(), 1203 temp->row_offsets->data().get(), 1204 temp->column_indices->data().get(), 1205 tempT->values->data().get(), 1206 tempT->column_indices->data().get(), 1207 tempT->row_offsets->data().get(), 1208 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1209 1210 /* Last, convert CSC to HYB */ 1211 cusparseHybMat_t hybMat; 1212 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1213 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1214 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1215 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1216 matstructT->descr, tempT->values->data().get(), 1217 tempT->row_offsets->data().get(), 1218 tempT->column_indices->data().get(), 1219 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1220 1221 /* assign the pointer */ 1222 matstructT->mat = hybMat; 1223 /* delete temporaries */ 1224 if (tempT) { 1225 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1226 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1227 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1228 delete (CsrMatrix*) tempT; 1229 } 1230 if (temp) { 1231 if (temp->values) delete (THRUSTARRAY*) temp->values; 1232 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1233 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1234 delete (CsrMatrix*) temp; 1235 } 1236 #endif 1237 } 1238 err = WaitForCUDA();CHKERRCUDA(err); 1239 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1240 ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1241 /* the compressed row indices is not used for matTranspose */ 1242 matstructT->cprowIndices = NULL; 1243 /* assign the pointer */ 1244 ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 1245 PetscFunctionReturn(0); 1246 } 1247 1248 /* Why do we need to analyze the tranposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 1249 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1250 { 1251 PetscInt n = xx->map->n; 1252 const PetscScalar *barray; 1253 PetscScalar *xarray; 1254 thrust::device_ptr<const PetscScalar> bGPU; 1255 thrust::device_ptr<PetscScalar> xGPU; 1256 cusparseStatus_t stat; 1257 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1258 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1259 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1260 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1261 PetscErrorCode ierr; 1262 cudaError_t cerr; 1263 1264 PetscFunctionBegin; 1265 /* Analyze the matrix and create the transpose ... on the fly */ 1266 if (!loTriFactorT && !upTriFactorT) { 1267 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1268 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1269 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1270 } 1271 1272 /* Get the GPU pointers */ 1273 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1274 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1275 xGPU = thrust::device_pointer_cast(xarray); 1276 bGPU = thrust::device_pointer_cast(barray); 1277 1278 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1279 /* First, reorder with the row permutation */ 1280 thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1281 thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 1282 xGPU); 1283 1284 /* First, solve U */ 1285 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1286 upTriFactorT->csrMat->num_rows, 1287 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1288 upTriFactorT->csrMat->num_entries, 1289 #endif 1290 &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1291 upTriFactorT->csrMat->values->data().get(), 1292 upTriFactorT->csrMat->row_offsets->data().get(), 1293 upTriFactorT->csrMat->column_indices->data().get(), 1294 upTriFactorT->solveInfo, 1295 xarray, tempGPU->data().get() 1296 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1297 ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1298 #endif 1299 );CHKERRCUSPARSE(stat); 1300 1301 /* Then, solve L */ 1302 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1303 loTriFactorT->csrMat->num_rows, 1304 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1305 loTriFactorT->csrMat->num_entries, 1306 #endif 1307 &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1308 loTriFactorT->csrMat->values->data().get(), 1309 loTriFactorT->csrMat->row_offsets->data().get(), 1310 loTriFactorT->csrMat->column_indices->data().get(), 1311 loTriFactorT->solveInfo, 1312 tempGPU->data().get(), xarray 1313 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1314 ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1315 #endif 1316 );CHKERRCUSPARSE(stat); 1317 1318 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1319 thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1320 thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1321 tempGPU->begin()); 1322 1323 /* Copy the temporary to the full solution. */ 1324 thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU); 1325 1326 /* restore */ 1327 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1328 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1329 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1330 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1331 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1332 PetscFunctionReturn(0); 1333 } 1334 1335 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1336 { 1337 const PetscScalar *barray; 1338 PetscScalar *xarray; 1339 cusparseStatus_t stat; 1340 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1341 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1342 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1343 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1344 PetscErrorCode ierr; 1345 cudaError_t cerr; 1346 1347 PetscFunctionBegin; 1348 /* Analyze the matrix and create the transpose ... on the fly */ 1349 if (!loTriFactorT && !upTriFactorT) { 1350 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1351 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1352 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1353 } 1354 1355 /* Get the GPU pointers */ 1356 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1357 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1358 1359 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1360 /* First, solve U */ 1361 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1362 upTriFactorT->csrMat->num_rows, 1363 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1364 upTriFactorT->csrMat->num_entries, 1365 #endif 1366 &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1367 upTriFactorT->csrMat->values->data().get(), 1368 upTriFactorT->csrMat->row_offsets->data().get(), 1369 upTriFactorT->csrMat->column_indices->data().get(), 1370 upTriFactorT->solveInfo, 1371 barray, tempGPU->data().get() 1372 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1373 ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1374 #endif 1375 );CHKERRCUSPARSE(stat); 1376 1377 /* Then, solve L */ 1378 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1379 loTriFactorT->csrMat->num_rows, 1380 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1381 loTriFactorT->csrMat->num_entries, 1382 #endif 1383 &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1384 loTriFactorT->csrMat->values->data().get(), 1385 loTriFactorT->csrMat->row_offsets->data().get(), 1386 loTriFactorT->csrMat->column_indices->data().get(), 1387 loTriFactorT->solveInfo, 1388 tempGPU->data().get(), xarray 1389 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1390 ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1391 #endif 1392 );CHKERRCUSPARSE(stat); 1393 1394 /* restore */ 1395 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1396 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1397 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1398 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1399 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1400 PetscFunctionReturn(0); 1401 } 1402 1403 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1404 { 1405 const PetscScalar *barray; 1406 PetscScalar *xarray; 1407 thrust::device_ptr<const PetscScalar> bGPU; 1408 thrust::device_ptr<PetscScalar> xGPU; 1409 cusparseStatus_t stat; 1410 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1411 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1412 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1413 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1414 PetscErrorCode ierr; 1415 cudaError_t cerr; 1416 1417 PetscFunctionBegin; 1418 1419 /* Get the GPU pointers */ 1420 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1421 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1422 xGPU = thrust::device_pointer_cast(xarray); 1423 bGPU = thrust::device_pointer_cast(barray); 1424 1425 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1426 /* First, reorder with the row permutation */ 1427 thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1428 thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 1429 tempGPU->begin()); 1430 1431 /* Next, solve L */ 1432 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1433 loTriFactor->csrMat->num_rows, 1434 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1435 loTriFactor->csrMat->num_entries, 1436 #endif 1437 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1438 loTriFactor->csrMat->values->data().get(), 1439 loTriFactor->csrMat->row_offsets->data().get(), 1440 loTriFactor->csrMat->column_indices->data().get(), 1441 loTriFactor->solveInfo, 1442 tempGPU->data().get(), xarray 1443 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1444 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 1445 #endif 1446 );CHKERRCUSPARSE(stat); 1447 1448 /* Then, solve U */ 1449 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1450 upTriFactor->csrMat->num_rows, 1451 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1452 upTriFactor->csrMat->num_entries, 1453 #endif 1454 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1455 upTriFactor->csrMat->values->data().get(), 1456 upTriFactor->csrMat->row_offsets->data().get(), 1457 upTriFactor->csrMat->column_indices->data().get(), 1458 upTriFactor->solveInfo, 1459 xarray, tempGPU->data().get() 1460 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1461 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 1462 #endif 1463 );CHKERRCUSPARSE(stat); 1464 1465 /* Last, reorder with the column permutation */ 1466 thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 1467 thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), 1468 xGPU); 1469 1470 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1471 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1472 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1473 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1474 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1475 PetscFunctionReturn(0); 1476 } 1477 1478 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1479 { 1480 const PetscScalar *barray; 1481 PetscScalar *xarray; 1482 cusparseStatus_t stat; 1483 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1484 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1485 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1486 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1487 PetscErrorCode ierr; 1488 cudaError_t cerr; 1489 1490 PetscFunctionBegin; 1491 /* Get the GPU pointers */ 1492 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1493 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1494 1495 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1496 /* First, solve L */ 1497 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1498 loTriFactor->csrMat->num_rows, 1499 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1500 loTriFactor->csrMat->num_entries, 1501 #endif 1502 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1503 loTriFactor->csrMat->values->data().get(), 1504 loTriFactor->csrMat->row_offsets->data().get(), 1505 loTriFactor->csrMat->column_indices->data().get(), 1506 loTriFactor->solveInfo, 1507 barray, tempGPU->data().get() 1508 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1509 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 1510 #endif 1511 );CHKERRCUSPARSE(stat); 1512 1513 /* Next, solve U */ 1514 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1515 upTriFactor->csrMat->num_rows, 1516 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1517 upTriFactor->csrMat->num_entries, 1518 #endif 1519 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1520 upTriFactor->csrMat->values->data().get(), 1521 upTriFactor->csrMat->row_offsets->data().get(), 1522 upTriFactor->csrMat->column_indices->data().get(), 1523 upTriFactor->solveInfo, 1524 tempGPU->data().get(), xarray 1525 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1526 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 1527 #endif 1528 );CHKERRCUSPARSE(stat); 1529 1530 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1531 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1532 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1533 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1534 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1535 PetscFunctionReturn(0); 1536 } 1537 1538 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) 1539 { 1540 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1541 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1542 cudaError_t cerr; 1543 PetscErrorCode ierr; 1544 1545 PetscFunctionBegin; 1546 if (A->offloadmask == PETSC_OFFLOAD_GPU) { 1547 CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat; 1548 1549 ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 1550 cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 1551 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1552 ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr); 1553 ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 1554 A->offloadmask = PETSC_OFFLOAD_BOTH; 1555 } 1556 PetscFunctionReturn(0); 1557 } 1558 1559 static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 1560 { 1561 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1562 PetscErrorCode ierr; 1563 1564 PetscFunctionBegin; 1565 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 1566 *array = a->a; 1567 A->offloadmask = PETSC_OFFLOAD_CPU; 1568 PetscFunctionReturn(0); 1569 } 1570 1571 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 1572 { 1573 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1574 Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 1575 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1576 PetscInt m = A->rmap->n,*ii,*ridx,tmp; 1577 PetscErrorCode ierr; 1578 cusparseStatus_t stat; 1579 cudaError_t err; 1580 1581 PetscFunctionBegin; 1582 if (A->boundtocpu) PetscFunctionReturn(0); 1583 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 1584 if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { 1585 /* Copy values only */ 1586 CsrMatrix *matrix,*matrixT; 1587 matrix = (CsrMatrix*)cusparsestruct->mat->mat; 1588 1589 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1590 matrix->values->assign(a->a, a->a+a->nz); 1591 err = WaitForCUDA();CHKERRCUDA(err); 1592 ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 1593 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1594 1595 /* Update matT when it was built before */ 1596 if (cusparsestruct->matTranspose) { 1597 cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr); 1598 matrixT = (CsrMatrix*)cusparsestruct->matTranspose->mat; 1599 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1600 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1601 A->cmap->n, matrix->num_entries, 1602 matrix->values->data().get(), 1603 cusparsestruct->rowoffsets_gpu->data().get(), 1604 matrix->column_indices->data().get(), 1605 matrixT->values->data().get(), 1606 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1607 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1608 CUSPARSE_ACTION_NUMERIC,indexBase, 1609 cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer 1610 #else 1611 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1612 CUSPARSE_ACTION_NUMERIC, indexBase 1613 #endif 1614 );CHKERRCUSPARSE(stat); 1615 err = WaitForCUDA();CHKERRCUDA(err); 1616 ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1617 } 1618 } else { 1619 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1620 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr); 1621 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr); 1622 delete cusparsestruct->workVector; 1623 delete cusparsestruct->rowoffsets_gpu; 1624 try { 1625 if (a->compressedrow.use) { 1626 m = a->compressedrow.nrows; 1627 ii = a->compressedrow.i; 1628 ridx = a->compressedrow.rindex; 1629 } else { 1630 m = A->rmap->n; 1631 ii = a->i; 1632 ridx = NULL; 1633 } 1634 cusparsestruct->nrows = m; 1635 1636 /* create cusparse matrix */ 1637 matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 1638 stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat); 1639 stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 1640 stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1641 1642 err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 1643 err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1644 err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1645 err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1646 err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1647 err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1648 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 1649 1650 /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1651 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1652 /* set the matrix */ 1653 CsrMatrix *mat= new CsrMatrix; 1654 mat->num_rows = m; 1655 mat->num_cols = A->cmap->n; 1656 mat->num_entries = a->nz; 1657 mat->row_offsets = new THRUSTINTARRAY32(m+1); 1658 mat->row_offsets->assign(ii, ii + m+1); 1659 1660 mat->column_indices = new THRUSTINTARRAY32(a->nz); 1661 mat->column_indices->assign(a->j, a->j+a->nz); 1662 1663 mat->values = new THRUSTARRAY(a->nz); 1664 mat->values->assign(a->a, a->a+a->nz); 1665 1666 /* assign the pointer */ 1667 matstruct->mat = mat; 1668 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1669 if (mat->num_rows) { /* cusparse errors on empty matrices! */ 1670 stat = cusparseCreateCsr(&matstruct->matDescr, 1671 mat->num_rows, mat->num_cols, mat->num_entries, 1672 mat->row_offsets->data().get(), mat->column_indices->data().get(), 1673 mat->values->data().get(), 1674 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 1675 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 1676 } 1677 #endif 1678 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1679 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1680 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1681 #else 1682 CsrMatrix *mat= new CsrMatrix; 1683 mat->num_rows = m; 1684 mat->num_cols = A->cmap->n; 1685 mat->num_entries = a->nz; 1686 mat->row_offsets = new THRUSTINTARRAY32(m+1); 1687 mat->row_offsets->assign(ii, ii + m+1); 1688 1689 mat->column_indices = new THRUSTINTARRAY32(a->nz); 1690 mat->column_indices->assign(a->j, a->j+a->nz); 1691 1692 mat->values = new THRUSTARRAY(a->nz); 1693 mat->values->assign(a->a, a->a+a->nz); 1694 1695 cusparseHybMat_t hybMat; 1696 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1697 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1698 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1699 stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, 1700 matstruct->descr, mat->values->data().get(), 1701 mat->row_offsets->data().get(), 1702 mat->column_indices->data().get(), 1703 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1704 /* assign the pointer */ 1705 matstruct->mat = hybMat; 1706 1707 if (mat) { 1708 if (mat->values) delete (THRUSTARRAY*)mat->values; 1709 if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices; 1710 if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets; 1711 delete (CsrMatrix*)mat; 1712 } 1713 #endif 1714 } 1715 1716 /* assign the compressed row indices */ 1717 if (a->compressedrow.use) { 1718 cusparsestruct->workVector = new THRUSTARRAY(m); 1719 matstruct->cprowIndices = new THRUSTINTARRAY(m); 1720 matstruct->cprowIndices->assign(ridx,ridx+m); 1721 tmp = m; 1722 } else { 1723 cusparsestruct->workVector = NULL; 1724 matstruct->cprowIndices = NULL; 1725 tmp = 0; 1726 } 1727 ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr); 1728 1729 /* assign the pointer */ 1730 cusparsestruct->mat = matstruct; 1731 } catch(char *ex) { 1732 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1733 } 1734 err = WaitForCUDA();CHKERRCUDA(err); 1735 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1736 cusparsestruct->nonzerostate = A->nonzerostate; 1737 } 1738 A->offloadmask = PETSC_OFFLOAD_BOTH; 1739 } 1740 PetscFunctionReturn(0); 1741 } 1742 1743 struct VecCUDAPlusEquals 1744 { 1745 template <typename Tuple> 1746 __host__ __device__ 1747 void operator()(Tuple t) 1748 { 1749 thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 1750 } 1751 }; 1752 1753 struct VecCUDAEquals 1754 { 1755 template <typename Tuple> 1756 __host__ __device__ 1757 void operator()(Tuple t) 1758 { 1759 thrust::get<1>(t) = thrust::get<0>(t); 1760 } 1761 }; 1762 1763 struct VecCUDAEqualsReverse 1764 { 1765 template <typename Tuple> 1766 __host__ __device__ 1767 void operator()(Tuple t) 1768 { 1769 thrust::get<0>(t) = thrust::get<1>(t); 1770 } 1771 }; 1772 1773 struct MatMatCusparse { 1774 PetscBool cisdense; 1775 PetscScalar *Bt; 1776 Mat X; 1777 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1778 PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 1779 cusparseDnMatDescr_t matBDescr; 1780 cusparseDnMatDescr_t matCDescr; 1781 size_t spmmBufferSize; 1782 void *spmmBuffer; 1783 PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/ 1784 #endif 1785 }; 1786 1787 static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 1788 { 1789 PetscErrorCode ierr; 1790 MatMatCusparse *mmdata = (MatMatCusparse *)data; 1791 cudaError_t cerr; 1792 1793 PetscFunctionBegin; 1794 cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr); 1795 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1796 cusparseStatus_t stat; 1797 if (mmdata->matBDescr) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat);} 1798 if (mmdata->matCDescr) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat);} 1799 if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);} 1800 #endif 1801 ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr); 1802 ierr = PetscFree(data);CHKERRQ(ierr); 1803 PetscFunctionReturn(0); 1804 } 1805 1806 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool); 1807 1808 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 1809 { 1810 Mat_Product *product = C->product; 1811 Mat A,B; 1812 PetscInt m,n,blda,clda; 1813 PetscBool flg,biscuda; 1814 Mat_SeqAIJCUSPARSE *cusp; 1815 cusparseStatus_t stat; 1816 cusparseOperation_t opA; 1817 const PetscScalar *barray; 1818 PetscScalar *carray; 1819 PetscErrorCode ierr; 1820 MatMatCusparse *mmdata; 1821 Mat_SeqAIJCUSPARSEMultStruct *mat; 1822 CsrMatrix *csrmat; 1823 cudaError_t cerr; 1824 1825 PetscFunctionBegin; 1826 MatCheckProduct(C,1); 1827 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 1828 mmdata = (MatMatCusparse*)product->data; 1829 A = product->A; 1830 B = product->B; 1831 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 1832 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 1833 /* currently CopyToGpu does not copy if the matrix is bound to CPU 1834 Instead of silently accepting the wrong answer, I prefer to raise the error */ 1835 if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 1836 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1837 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1838 switch (product->type) { 1839 case MATPRODUCT_AB: 1840 case MATPRODUCT_PtAP: 1841 mat = cusp->mat; 1842 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1843 m = A->rmap->n; 1844 n = B->cmap->n; 1845 break; 1846 case MATPRODUCT_AtB: 1847 if (!cusp->transgen) { 1848 mat = cusp->mat; 1849 opA = CUSPARSE_OPERATION_TRANSPOSE; 1850 } else { 1851 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1852 mat = cusp->matTranspose; 1853 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1854 } 1855 m = A->cmap->n; 1856 n = B->cmap->n; 1857 break; 1858 case MATPRODUCT_ABt: 1859 case MATPRODUCT_RARt: 1860 mat = cusp->mat; 1861 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1862 m = A->rmap->n; 1863 n = B->rmap->n; 1864 break; 1865 default: 1866 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 1867 } 1868 if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 1869 csrmat = (CsrMatrix*)mat->mat; 1870 /* if the user passed a CPU matrix, copy the data to the GPU */ 1871 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr); 1872 if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);} 1873 ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr); 1874 1875 ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr); 1876 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 1877 ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 1878 ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr); 1879 } else { 1880 ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr); 1881 ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr); 1882 } 1883 1884 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1885 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1886 cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 1887 /* (re)allcoate spmmBuffer if not initialized or LDAs are different */ 1888 if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 1889 if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;} 1890 if (!mmdata->matBDescr) { 1891 stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 1892 mmdata->Blda = blda; 1893 } 1894 1895 if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;} 1896 if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 1897 stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 1898 mmdata->Clda = clda; 1899 } 1900 1901 if (!mat->matDescr) { 1902 stat = cusparseCreateCsr(&mat->matDescr, 1903 csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, 1904 csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), 1905 csrmat->values->data().get(), 1906 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 1907 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 1908 } 1909 stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one, 1910 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 1911 mmdata->matCDescr,cusparse_scalartype, 1912 cusp->spmmAlg,&mmdata->spmmBufferSize);CHKERRCUSPARSE(stat); 1913 if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);} 1914 cerr = cudaMalloc(&mmdata->spmmBuffer,mmdata->spmmBufferSize);CHKERRCUDA(cerr); 1915 mmdata->initialized = PETSC_TRUE; 1916 } else { 1917 /* to be safe, always update pointers of the mats */ 1918 stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat); 1919 stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat); 1920 stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat); 1921 } 1922 1923 /* do cusparseSpMM, which supports transpose on B */ 1924 stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one, 1925 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 1926 mmdata->matCDescr,cusparse_scalartype, 1927 cusp->spmmAlg,mmdata->spmmBuffer);CHKERRCUSPARSE(stat); 1928 #else 1929 PetscInt k; 1930 /* cusparseXcsrmm does not support transpose on B */ 1931 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 1932 cublasHandle_t cublasv2handle; 1933 cublasStatus_t cerr; 1934 1935 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 1936 cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T, 1937 B->cmap->n,B->rmap->n, 1938 &PETSC_CUSPARSE_ONE ,barray,blda, 1939 &PETSC_CUSPARSE_ZERO,barray,blda, 1940 mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr); 1941 blda = B->cmap->n; 1942 k = B->cmap->n; 1943 } else { 1944 k = B->rmap->n; 1945 } 1946 1947 /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 1948 stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k, 1949 csrmat->num_entries,mat->alpha_one,mat->descr, 1950 csrmat->values->data().get(), 1951 csrmat->row_offsets->data().get(), 1952 csrmat->column_indices->data().get(), 1953 mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero, 1954 carray,clda);CHKERRCUSPARSE(stat); 1955 #endif 1956 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1957 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1958 ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr); 1959 ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr); 1960 if (product->type == MATPRODUCT_RARt) { 1961 ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 1962 ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 1963 } else if (product->type == MATPRODUCT_PtAP) { 1964 ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 1965 ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 1966 } else { 1967 ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr); 1968 } 1969 if (mmdata->cisdense) { 1970 ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr); 1971 } 1972 if (!biscuda) { 1973 ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 1974 } 1975 PetscFunctionReturn(0); 1976 } 1977 1978 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 1979 { 1980 Mat_Product *product = C->product; 1981 Mat A,B; 1982 PetscInt m,n; 1983 PetscBool cisdense,flg; 1984 PetscErrorCode ierr; 1985 MatMatCusparse *mmdata; 1986 Mat_SeqAIJCUSPARSE *cusp; 1987 1988 PetscFunctionBegin; 1989 MatCheckProduct(C,1); 1990 if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty"); 1991 A = product->A; 1992 B = product->B; 1993 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 1994 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 1995 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1996 if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 1997 switch (product->type) { 1998 case MATPRODUCT_AB: 1999 m = A->rmap->n; 2000 n = B->cmap->n; 2001 break; 2002 case MATPRODUCT_AtB: 2003 m = A->cmap->n; 2004 n = B->cmap->n; 2005 break; 2006 case MATPRODUCT_ABt: 2007 m = A->rmap->n; 2008 n = B->rmap->n; 2009 break; 2010 case MATPRODUCT_PtAP: 2011 m = B->cmap->n; 2012 n = B->cmap->n; 2013 break; 2014 case MATPRODUCT_RARt: 2015 m = B->rmap->n; 2016 n = B->rmap->n; 2017 break; 2018 default: 2019 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 2020 } 2021 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2022 /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */ 2023 ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr); 2024 ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr); 2025 2026 /* product data */ 2027 ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2028 mmdata->cisdense = cisdense; 2029 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0) 2030 /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */ 2031 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2032 cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr); 2033 } 2034 #endif 2035 /* for these products we need intermediate storage */ 2036 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2037 ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr); 2038 ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr); 2039 if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */ 2040 ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr); 2041 } else { 2042 ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr); 2043 } 2044 } 2045 C->product->data = mmdata; 2046 C->product->destroy = MatDestroy_MatMatCusparse; 2047 2048 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 2049 PetscFunctionReturn(0); 2050 } 2051 2052 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 2053 2054 /* handles dense B */ 2055 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C) 2056 { 2057 Mat_Product *product = C->product; 2058 PetscErrorCode ierr; 2059 2060 PetscFunctionBegin; 2061 MatCheckProduct(C,1); 2062 if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A"); 2063 if (product->A->boundtocpu) { 2064 ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(C);CHKERRQ(ierr); 2065 PetscFunctionReturn(0); 2066 } 2067 switch (product->type) { 2068 case MATPRODUCT_AB: 2069 case MATPRODUCT_AtB: 2070 case MATPRODUCT_ABt: 2071 case MATPRODUCT_PtAP: 2072 case MATPRODUCT_RARt: 2073 C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 2074 default: 2075 break; 2076 } 2077 PetscFunctionReturn(0); 2078 } 2079 2080 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2081 { 2082 PetscErrorCode ierr; 2083 2084 PetscFunctionBegin; 2085 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2086 PetscFunctionReturn(0); 2087 } 2088 2089 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz) 2090 { 2091 PetscErrorCode ierr; 2092 2093 PetscFunctionBegin; 2094 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2095 PetscFunctionReturn(0); 2096 } 2097 2098 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2099 { 2100 PetscErrorCode ierr; 2101 2102 PetscFunctionBegin; 2103 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 2104 PetscFunctionReturn(0); 2105 } 2106 2107 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2108 { 2109 PetscErrorCode ierr; 2110 2111 PetscFunctionBegin; 2112 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 2113 PetscFunctionReturn(0); 2114 } 2115 2116 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2117 { 2118 PetscErrorCode ierr; 2119 2120 PetscFunctionBegin; 2121 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2122 PetscFunctionReturn(0); 2123 } 2124 2125 /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 2126 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm) 2127 { 2128 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2129 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 2130 Mat_SeqAIJCUSPARSEMultStruct *matstruct; 2131 PetscScalar *xarray,*zarray,*dptr,*beta,*xptr; 2132 PetscErrorCode ierr; 2133 cudaError_t cerr; 2134 cusparseStatus_t stat; 2135 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2136 PetscBool compressed; 2137 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2138 PetscInt nx,ny; 2139 #endif 2140 2141 PetscFunctionBegin; 2142 if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported"); 2143 if (!a->nonzerorowcnt) { 2144 if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);} 2145 else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);} 2146 PetscFunctionReturn(0); 2147 } 2148 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 2149 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2150 if (!trans) { 2151 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 2152 if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 2153 } else { 2154 if (herm || !cusparsestruct->transgen) { 2155 opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 2156 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 2157 } else { 2158 if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);} 2159 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 2160 } 2161 } 2162 /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 2163 compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE; 2164 2165 try { 2166 ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 2167 if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */ 2168 else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */ 2169 2170 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2171 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 2172 /* z = A x + beta y. 2173 If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 2174 When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 2175 */ 2176 xptr = xarray; 2177 dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 2178 beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 2179 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2180 /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 2181 allocated to accommodate different uses. So we get the length info directly from mat. 2182 */ 2183 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2184 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2185 nx = mat->num_cols; 2186 ny = mat->num_rows; 2187 } 2188 #endif 2189 } else { 2190 /* z = A^T x + beta y 2191 If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 2192 Note A^Tx is of full length, so we set beta to 1.0 if y exists. 2193 */ 2194 xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 2195 dptr = zarray; 2196 beta = yy ? matstruct->beta_one : matstruct->beta_zero; 2197 if (compressed) { /* Scatter x to work vector */ 2198 thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 2199 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 2200 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 2201 VecCUDAEqualsReverse()); 2202 } 2203 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2204 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2205 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2206 nx = mat->num_rows; 2207 ny = mat->num_cols; 2208 } 2209 #endif 2210 } 2211 2212 /* csr_spmv does y = alpha op(A) x + beta y */ 2213 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2214 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2215 if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 2216 if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 2217 stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 2218 stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 2219 stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, 2220 matstruct->matDescr, 2221 matstruct->cuSpMV[opA].vecXDescr, beta, 2222 matstruct->cuSpMV[opA].vecYDescr, 2223 cusparse_scalartype, 2224 cusparsestruct->spmvAlg, 2225 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat); 2226 cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr); 2227 2228 matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 2229 } else { 2230 /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 2231 stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat); 2232 stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat); 2233 } 2234 2235 stat = cusparseSpMV(cusparsestruct->handle, opA, 2236 matstruct->alpha_one, 2237 matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */ 2238 matstruct->cuSpMV[opA].vecXDescr, 2239 beta, 2240 matstruct->cuSpMV[opA].vecYDescr, 2241 cusparse_scalartype, 2242 cusparsestruct->spmvAlg, 2243 matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat); 2244 #else 2245 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2246 stat = cusparse_csr_spmv(cusparsestruct->handle, opA, 2247 mat->num_rows, mat->num_cols, 2248 mat->num_entries, matstruct->alpha_one, matstruct->descr, 2249 mat->values->data().get(), mat->row_offsets->data().get(), 2250 mat->column_indices->data().get(), xptr, beta, 2251 dptr);CHKERRCUSPARSE(stat); 2252 #endif 2253 } else { 2254 if (cusparsestruct->nrows) { 2255 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2256 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 2257 #else 2258 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 2259 stat = cusparse_hyb_spmv(cusparsestruct->handle, opA, 2260 matstruct->alpha_one, matstruct->descr, hybMat, 2261 xptr, beta, 2262 dptr);CHKERRCUSPARSE(stat); 2263 #endif 2264 } 2265 } 2266 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2267 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2268 2269 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 2270 if (yy) { /* MatMultAdd: zz = A*xx + yy */ 2271 if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 2272 ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */ 2273 } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */ 2274 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 2275 } 2276 } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 2277 ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr); 2278 } 2279 2280 /* ScatterAdd the result from work vector into the full vector when A is compressed */ 2281 if (compressed) { 2282 thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 2283 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2284 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 2285 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 2286 VecCUDAPlusEquals()); 2287 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2288 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2289 } 2290 } else { 2291 if (yy && yy != zz) { 2292 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 2293 } 2294 } 2295 ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 2296 if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);} 2297 else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);} 2298 } catch(char *ex) { 2299 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 2300 } 2301 if (yy) { 2302 ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); 2303 } else { 2304 ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr); 2305 } 2306 PetscFunctionReturn(0); 2307 } 2308 2309 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2310 { 2311 PetscErrorCode ierr; 2312 2313 PetscFunctionBegin; 2314 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2315 PetscFunctionReturn(0); 2316 } 2317 2318 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 2319 { 2320 PetscErrorCode ierr; 2321 PetscSplitCSRDataStructure *d_mat = NULL, h_mat; 2322 PetscBool is_seq = PETSC_TRUE; 2323 PetscInt nnz_state = A->nonzerostate; 2324 PetscFunctionBegin; 2325 if (A->factortype == MAT_FACTOR_NONE) { 2326 d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat; 2327 } 2328 if (d_mat) { 2329 cudaError_t err; 2330 ierr = PetscInfo(A,"Assemble device matrix\n");CHKERRQ(ierr); 2331 err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err); 2332 nnz_state = h_mat.nonzerostate; 2333 is_seq = h_mat.seq; 2334 } 2335 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it? 2336 if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0); 2337 if (A->factortype == MAT_FACTOR_NONE && A->nonzerostate >= nnz_state && is_seq) { // assembled on CPU eventhough equiped for GPU 2338 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2339 } else if (nnz_state > A->nonzerostate) { 2340 A->offloadmask = PETSC_OFFLOAD_GPU; 2341 } 2342 2343 PetscFunctionReturn(0); 2344 } 2345 2346 /* --------------------------------------------------------------------------------*/ 2347 /*@ 2348 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 2349 (the default parallel PETSc format). This matrix will ultimately pushed down 2350 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 2351 assembly performance the user should preallocate the matrix storage by setting 2352 the parameter nz (or the array nnz). By setting these parameters accurately, 2353 performance during matrix assembly can be increased by more than a factor of 50. 2354 2355 Collective 2356 2357 Input Parameters: 2358 + comm - MPI communicator, set to PETSC_COMM_SELF 2359 . m - number of rows 2360 . n - number of columns 2361 . nz - number of nonzeros per row (same for all rows) 2362 - nnz - array containing the number of nonzeros in the various rows 2363 (possibly different for each row) or NULL 2364 2365 Output Parameter: 2366 . A - the matrix 2367 2368 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 2369 MatXXXXSetPreallocation() paradgm instead of this routine directly. 2370 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 2371 2372 Notes: 2373 If nnz is given then nz is ignored 2374 2375 The AIJ format (also called the Yale sparse matrix format or 2376 compressed row storage), is fully compatible with standard Fortran 77 2377 storage. That is, the stored row and column indices can begin at 2378 either one (as in Fortran) or zero. See the users' manual for details. 2379 2380 Specify the preallocated storage with either nz or nnz (not both). 2381 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 2382 allocation. For large problems you MUST preallocate memory or you 2383 will get TERRIBLE performance, see the users' manual chapter on matrices. 2384 2385 By default, this format uses inodes (identical nodes) when possible, to 2386 improve numerical efficiency of matrix-vector products and solves. We 2387 search for consecutive rows with the same nonzero structure, thereby 2388 reusing matrix information to achieve increased efficiency. 2389 2390 Level: intermediate 2391 2392 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 2393 @*/ 2394 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 2395 { 2396 PetscErrorCode ierr; 2397 2398 PetscFunctionBegin; 2399 ierr = MatCreate(comm,A);CHKERRQ(ierr); 2400 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 2401 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 2402 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 2403 PetscFunctionReturn(0); 2404 } 2405 2406 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 2407 { 2408 PetscErrorCode ierr; 2409 PetscSplitCSRDataStructure *d_mat = NULL; 2410 2411 PetscFunctionBegin; 2412 if (A->factortype == MAT_FACTOR_NONE) { 2413 d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat; 2414 ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL; 2415 ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr); 2416 } else { 2417 ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr); 2418 } 2419 if (d_mat) { 2420 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2421 cudaError_t err; 2422 PetscSplitCSRDataStructure h_mat; 2423 ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr); 2424 err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err); 2425 if (h_mat.seq) { 2426 if (a->compressedrow.use) { 2427 err = cudaFree(h_mat.diag.i);CHKERRCUDA(err); 2428 } 2429 err = cudaFree(d_mat);CHKERRCUDA(err); 2430 } 2431 } 2432 ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr); 2433 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 2434 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 2435 ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 2436 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 2437 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 2438 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 2439 PetscFunctionReturn(0); 2440 } 2441 2442 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*); 2443 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool); 2444 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B) 2445 { 2446 PetscErrorCode ierr; 2447 2448 PetscFunctionBegin; 2449 ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 2450 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr); 2451 PetscFunctionReturn(0); 2452 } 2453 2454 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg) 2455 { 2456 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2457 PetscErrorCode ierr; 2458 2459 PetscFunctionBegin; 2460 if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0); 2461 if (flg) { 2462 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 2463 2464 A->ops->mult = MatMult_SeqAIJ; 2465 A->ops->multadd = MatMultAdd_SeqAIJ; 2466 A->ops->multtranspose = MatMultTranspose_SeqAIJ; 2467 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 2468 A->ops->multhermitiantranspose = NULL; 2469 A->ops->multhermitiantransposeadd = NULL; 2470 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 2471 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 2472 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 2473 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 2474 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr); 2475 } else { 2476 A->ops->mult = MatMult_SeqAIJCUSPARSE; 2477 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 2478 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 2479 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 2480 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 2481 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 2482 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 2483 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 2484 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 2485 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 2486 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr); 2487 } 2488 A->boundtocpu = flg; 2489 a->inode.use = flg; 2490 PetscFunctionReturn(0); 2491 } 2492 2493 static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) 2494 { 2495 PetscSplitCSRDataStructure *d_mat = NULL; 2496 PetscErrorCode ierr; 2497 PetscBool both = PETSC_FALSE; 2498 2499 PetscFunctionBegin; 2500 if (A->factortype == MAT_FACTOR_NONE) { 2501 Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr; 2502 if (spptr->mat) { 2503 CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat; 2504 if (matrix->values) { 2505 both = PETSC_TRUE; 2506 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 2507 } 2508 } 2509 if (spptr->matTranspose) { 2510 CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat; 2511 if (matrix->values) { 2512 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 2513 } 2514 } 2515 d_mat = spptr->deviceMat; 2516 } 2517 if (d_mat) { 2518 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2519 PetscInt n = A->rmap->n, nnz = a->i[n]; 2520 cudaError_t err; 2521 PetscScalar *vals; 2522 ierr = PetscInfo(A,"Zero device matrix\n");CHKERRQ(ierr); 2523 err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err); 2524 err = cudaMemset( vals, 0, (nnz)*sizeof(PetscScalar));CHKERRCUDA(err); 2525 } 2526 ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 2527 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 2528 2529 PetscFunctionReturn(0); 2530 } 2531 2532 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 2533 { 2534 PetscErrorCode ierr; 2535 cusparseStatus_t stat; 2536 Mat B; 2537 2538 PetscFunctionBegin; 2539 if (reuse == MAT_INITIAL_MATRIX) { 2540 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 2541 } else if (reuse == MAT_REUSE_MATRIX) { 2542 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 2543 } 2544 B = *newmat; 2545 2546 ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 2547 ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr); 2548 2549 if (reuse != MAT_REUSE_MATRIX && !B->spptr) { 2550 if (B->factortype == MAT_FACTOR_NONE) { 2551 Mat_SeqAIJCUSPARSE *spptr; 2552 2553 ierr = PetscNew(&spptr);CHKERRQ(ierr); 2554 spptr->format = MAT_CUSPARSE_CSR; 2555 stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 2556 B->spptr = spptr; 2557 spptr->deviceMat = NULL; 2558 } else { 2559 Mat_SeqAIJCUSPARSETriFactors *spptr; 2560 2561 ierr = PetscNew(&spptr);CHKERRQ(ierr); 2562 stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 2563 B->spptr = spptr; 2564 } 2565 B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 2566 } 2567 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 2568 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 2569 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 2570 B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 2571 B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 2572 B->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE; 2573 2574 ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr); 2575 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 2576 ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 2577 PetscFunctionReturn(0); 2578 } 2579 2580 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 2581 { 2582 PetscErrorCode ierr; 2583 2584 PetscFunctionBegin; 2585 ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); 2586 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 2587 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 2588 ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr); 2589 ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr); 2590 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2591 PetscFunctionReturn(0); 2592 } 2593 2594 /*MC 2595 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 2596 2597 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 2598 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 2599 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 2600 2601 Options Database Keys: 2602 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 2603 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 2604 - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 2605 2606 Level: beginner 2607 2608 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 2609 M*/ 2610 2611 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*); 2612 2613 2614 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 2615 { 2616 PetscErrorCode ierr; 2617 2618 PetscFunctionBegin; 2619 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 2620 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 2621 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 2622 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 2623 PetscFunctionReturn(0); 2624 } 2625 2626 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 2627 { 2628 PetscErrorCode ierr; 2629 cusparseStatus_t stat; 2630 2631 PetscFunctionBegin; 2632 if (*cusparsestruct) { 2633 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr); 2634 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr); 2635 delete (*cusparsestruct)->workVector; 2636 delete (*cusparsestruct)->rowoffsets_gpu; 2637 delete (*cusparsestruct)->cooPerm; 2638 delete (*cusparsestruct)->cooPerm_a; 2639 delete (*cusparsestruct)->cooPerm_v; 2640 delete (*cusparsestruct)->cooPerm_w; 2641 if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);} 2642 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2643 cudaError_t cerr = cudaFree((*cusparsestruct)->csr2cscBuffer);CHKERRCUDA(cerr); 2644 #endif 2645 ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr); 2646 } 2647 PetscFunctionReturn(0); 2648 } 2649 2650 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 2651 { 2652 PetscFunctionBegin; 2653 if (*mat) { 2654 delete (*mat)->values; 2655 delete (*mat)->column_indices; 2656 delete (*mat)->row_offsets; 2657 delete *mat; 2658 *mat = 0; 2659 } 2660 PetscFunctionReturn(0); 2661 } 2662 2663 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 2664 { 2665 cusparseStatus_t stat; 2666 PetscErrorCode ierr; 2667 2668 PetscFunctionBegin; 2669 if (*trifactor) { 2670 if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); } 2671 if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); } 2672 ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 2673 if ((*trifactor)->solveBuffer) {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);} 2674 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2675 if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);} 2676 #endif 2677 delete *trifactor; 2678 *trifactor = NULL; 2679 } 2680 PetscFunctionReturn(0); 2681 } 2682 2683 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 2684 { 2685 CsrMatrix *mat; 2686 cusparseStatus_t stat; 2687 cudaError_t err; 2688 2689 PetscFunctionBegin; 2690 if (*matstruct) { 2691 if ((*matstruct)->mat) { 2692 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 2693 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2694 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 2695 #else 2696 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 2697 stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat); 2698 #endif 2699 } else { 2700 mat = (CsrMatrix*)(*matstruct)->mat; 2701 CsrMatrix_Destroy(&mat); 2702 } 2703 } 2704 if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); } 2705 delete (*matstruct)->cprowIndices; 2706 if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); } 2707 if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); } 2708 if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); } 2709 2710 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2711 Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 2712 if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);} 2713 for (int i=0; i<3; i++) { 2714 if (mdata->cuSpMV[i].initialized) { 2715 err = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err); 2716 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat); 2717 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat); 2718 } 2719 } 2720 #endif 2721 delete *matstruct; 2722 *matstruct = NULL; 2723 } 2724 PetscFunctionReturn(0); 2725 } 2726 2727 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors) 2728 { 2729 PetscErrorCode ierr; 2730 2731 PetscFunctionBegin; 2732 if (*trifactors) { 2733 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr); 2734 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr); 2735 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr); 2736 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr); 2737 delete (*trifactors)->rpermIndices; 2738 delete (*trifactors)->cpermIndices; 2739 delete (*trifactors)->workVector; 2740 (*trifactors)->rpermIndices = NULL; 2741 (*trifactors)->cpermIndices = NULL; 2742 (*trifactors)->workVector = NULL; 2743 } 2744 PetscFunctionReturn(0); 2745 } 2746 2747 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 2748 { 2749 PetscErrorCode ierr; 2750 cusparseHandle_t handle; 2751 cusparseStatus_t stat; 2752 2753 PetscFunctionBegin; 2754 if (*trifactors) { 2755 ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr); 2756 if (handle = (*trifactors)->handle) { 2757 stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 2758 } 2759 ierr = PetscFree(*trifactors);CHKERRQ(ierr); 2760 } 2761 PetscFunctionReturn(0); 2762 } 2763 2764 struct IJCompare 2765 { 2766 __host__ __device__ 2767 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 2768 { 2769 if (t1.get<0>() < t2.get<0>()) return true; 2770 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 2771 return false; 2772 } 2773 }; 2774 2775 struct IJEqual 2776 { 2777 __host__ __device__ 2778 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 2779 { 2780 if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false; 2781 return true; 2782 } 2783 }; 2784 2785 struct IJDiff 2786 { 2787 __host__ __device__ 2788 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 2789 { 2790 return t1 == t2 ? 0 : 1; 2791 } 2792 }; 2793 2794 struct IJSum 2795 { 2796 __host__ __device__ 2797 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 2798 { 2799 return t1||t2; 2800 } 2801 }; 2802 2803 #include <thrust/iterator/discard_iterator.h> 2804 PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 2805 { 2806 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2807 CsrMatrix *matrix; 2808 PetscErrorCode ierr; 2809 cudaError_t cerr; 2810 PetscInt n; 2811 2812 PetscFunctionBegin; 2813 if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct"); 2814 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix"); 2815 if (!cusp->cooPerm) { 2816 ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2817 ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2818 PetscFunctionReturn(0); 2819 } 2820 n = cusp->cooPerm->size(); 2821 matrix = (CsrMatrix*)cusp->mat->mat; 2822 if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 2823 if (!cusp->cooPerm_v) { cusp->cooPerm_v = new THRUSTARRAY(n); } 2824 ierr = PetscLogEventBegin(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr); 2825 if (v) { 2826 cusp->cooPerm_v->assign(v,v+n); 2827 ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); 2828 } 2829 else thrust::fill(thrust::device,cusp->cooPerm_v->begin(),cusp->cooPerm_v->end(),0.); 2830 if (imode == ADD_VALUES) { 2831 if (cusp->cooPerm_a) { 2832 if (!cusp->cooPerm_w) cusp->cooPerm_w = new THRUSTARRAY(matrix->values->size()); 2833 auto vbit = thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()); 2834 thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cusp->cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 2835 thrust::transform(cusp->cooPerm_w->begin(),cusp->cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>()); 2836 } else { 2837 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()), 2838 matrix->values->begin())); 2839 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()), 2840 matrix->values->end())); 2841 thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); 2842 } 2843 } else { 2844 if (cusp->cooPerm_a) { /* non unique values insertion, result is undefined (we cannot guarantee last takes precedence) 2845 if we are inserting two different values into the same location */ 2846 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()), 2847 thrust::make_permutation_iterator(matrix->values->begin(),cusp->cooPerm_a->begin()))); 2848 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()), 2849 thrust::make_permutation_iterator(matrix->values->begin(),cusp->cooPerm_a->end()))); 2850 thrust::for_each(zibit,zieit,VecCUDAEquals()); 2851 } else { 2852 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()), 2853 matrix->values->begin())); 2854 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()), 2855 matrix->values->end())); 2856 thrust::for_each(zibit,zieit,VecCUDAEquals()); 2857 } 2858 } 2859 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2860 ierr = PetscLogEventEnd(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr); 2861 A->offloadmask = PETSC_OFFLOAD_GPU; 2862 ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2863 ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2864 /* we can remove this call when MatSeqAIJGetArray operations are used everywhere! */ 2865 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 2866 PetscFunctionReturn(0); 2867 } 2868 2869 #include <thrust/binary_search.h> 2870 PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[]) 2871 { 2872 PetscErrorCode ierr; 2873 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2874 CsrMatrix *matrix; 2875 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2876 PetscInt cooPerm_n, nzr = 0; 2877 cudaError_t cerr; 2878 2879 PetscFunctionBegin; 2880 ierr = PetscLogEventBegin(MAT_CUSPARSEPreallCOO,A,0,0,0);CHKERRQ(ierr); 2881 ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr); 2882 ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr); 2883 cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0; 2884 if (n != cooPerm_n) { 2885 delete cusp->cooPerm; 2886 delete cusp->cooPerm_v; 2887 delete cusp->cooPerm_w; 2888 delete cusp->cooPerm_a; 2889 cusp->cooPerm = NULL; 2890 cusp->cooPerm_v = NULL; 2891 cusp->cooPerm_w = NULL; 2892 cusp->cooPerm_a = NULL; 2893 } 2894 if (n) { 2895 THRUSTINTARRAY d_i(n); 2896 THRUSTINTARRAY d_j(n); 2897 THRUSTINTARRAY ii(A->rmap->n); 2898 2899 if (!cusp->cooPerm) { cusp->cooPerm = new THRUSTINTARRAY(n); } 2900 if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); } 2901 2902 ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 2903 d_i.assign(coo_i,coo_i+n); 2904 d_j.assign(coo_j,coo_j+n); 2905 auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin())); 2906 auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end())); 2907 2908 thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0); 2909 thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); 2910 *cusp->cooPerm_a = d_i; 2911 THRUSTINTARRAY w = d_j; 2912 2913 auto nekey = thrust::unique(fkey, ekey, IJEqual()); 2914 if (nekey == ekey) { /* all entries are unique */ 2915 delete cusp->cooPerm_a; 2916 cusp->cooPerm_a = NULL; 2917 } else { /* I couldn't come up with a more elegant algorithm */ 2918 adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); 2919 adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff()); 2920 (*cusp->cooPerm_a)[0] = 0; 2921 w[0] = 0; 2922 thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); 2923 thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); 2924 } 2925 thrust::counting_iterator<PetscInt> search_begin(0); 2926 thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), 2927 search_begin, search_begin + A->rmap->n, 2928 ii.begin()); 2929 2930 ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr); 2931 a->singlemalloc = PETSC_FALSE; 2932 a->free_a = PETSC_TRUE; 2933 a->free_ij = PETSC_TRUE; 2934 ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr); 2935 a->i[0] = 0; 2936 cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2937 a->nz = a->maxnz = a->i[A->rmap->n]; 2938 ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr); 2939 ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr); 2940 cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2941 if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); } 2942 if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); } 2943 for (PetscInt i = 0; i < A->rmap->n; i++) { 2944 const PetscInt nnzr = a->i[i+1] - a->i[i]; 2945 nzr += (PetscInt)!!(nnzr); 2946 a->ilen[i] = a->imax[i] = nnzr; 2947 } 2948 A->preallocated = PETSC_TRUE; 2949 ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr); 2950 } else { 2951 ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr); 2952 } 2953 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2954 ierr = PetscLogEventEnd(MAT_CUSPARSEPreallCOO,A,0,0,0);CHKERRQ(ierr); 2955 2956 /* We want to allocate the CUSPARSE struct for matvec now. 2957 The code is so convoluted now that I prefer to copy garbage to the GPU */ 2958 ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr); 2959 A->offloadmask = PETSC_OFFLOAD_CPU; 2960 A->nonzerostate++; 2961 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2962 { 2963 matrix = (CsrMatrix*)cusp->mat->mat; 2964 if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 2965 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 2966 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr); 2967 } 2968 2969 A->offloadmask = PETSC_OFFLOAD_CPU; 2970 A->assembled = PETSC_FALSE; 2971 A->was_assembled = PETSC_FALSE; 2972 PetscFunctionReturn(0); 2973 } 2974