/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>

const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
  } cusparseCsr2CscAlg_t;
*/
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG","CSRMV_ALG1","CSRMV_ALG2","cusparseSpMVAlg_t","CUSPARSE_",0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif

static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}
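
/* A minimal usage sketch (illustrative, not taken from this file): attach a user-created
   CUDA stream so that cuSPARSE kernels launched for a MATSEQAIJCUSPARSE matrix A are
   ordered on that stream rather than on the default stream; ierr/cerr are the usual
   PETSc/CUDA error variables.

     cudaStream_t stream;
     cerr = cudaStreamCreate(&stream);CHKERRCUDA(cerr);
     ierr = MatCUSPARSESetStream(A,stream);CHKERRQ(ierr);
*/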
"cusparse" - A matrix type providing triangular solvers for seq matrices 127 on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported 128 algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 129 performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 130 CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 131 algorithms are not recommended. This class does NOT support direct solver operations. 132 133 Level: beginner 134 135 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 136 M*/ 137 138 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B) 139 { 140 PetscErrorCode ierr; 141 PetscInt n = A->rmap->n; 142 143 PetscFunctionBegin; 144 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 145 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 146 (*B)->factortype = ftype; 147 (*B)->useordering = PETSC_TRUE; 148 ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 149 150 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 151 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 152 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 153 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 154 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 155 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 156 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 157 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 158 159 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 160 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr); 161 PetscFunctionReturn(0); 162 } 163 164 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 165 { 166 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 167 168 PetscFunctionBegin; 169 switch (op) { 170 case MAT_CUSPARSE_MULT: 171 cusparsestruct->format = format; 172 break; 173 case MAT_CUSPARSE_ALL: 174 cusparsestruct->format = format; 175 break; 176 default: 177 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op); 178 } 179 PetscFunctionReturn(0); 180 } 181 182 /*@ 183 MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 184 operation. Only the MatMult operation can use different GPU storage formats 185 for MPIAIJCUSPARSE matrices. 186 Not Collective 187 188 Input Parameters: 189 + A - Matrix of type SEQAIJCUSPARSE 190 . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 191 - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. 

PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype  = ftype;
  (*B)->useordering = PETSC_TRUE;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
  MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
  operation. Only the MatMult operation can use different GPU storage formats
  for MPIAIJCUSPARSE matrices.

  Not Collective

  Input Parameters:
+ A - Matrix of type SEQAIJCUSPARSE
. op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
- format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, or MAT_CUSPARSE_HYB; the latter two require CUDA 4.2)

  Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
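
/* A minimal usage sketch (A is assumed to be an existing MATSEQAIJCUSPARSE matrix):
   store the matrix used by MatMult() in HYB format instead of the default CSR.

     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,MAT_CUSPARSE_HYB);CHKERRQ(ierr);

   The command-line equivalents are -mat_cusparse_mult_storage_format hyb and, for all
   operations, -mat_cusparse_storage_format hyb (see MatSetFromOptions_SeqAIJCUSPARSE below). */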
"cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr); 268 /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */ 269 if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 270 271 cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 272 ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", 273 "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr); 274 if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly"); 275 276 cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 277 ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", 278 "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr); 279 if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly"); 280 #endif 281 } 282 ierr = PetscOptionsTail();CHKERRQ(ierr); 283 PetscFunctionReturn(0); 284 } 285 286 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 287 { 288 PetscErrorCode ierr; 289 290 PetscFunctionBegin; 291 ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 292 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 293 PetscFunctionReturn(0); 294 } 295 296 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 297 { 298 PetscErrorCode ierr; 299 300 PetscFunctionBegin; 301 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 302 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 303 PetscFunctionReturn(0); 304 } 305 306 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 307 { 308 PetscErrorCode ierr; 309 310 PetscFunctionBegin; 311 ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 312 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 313 PetscFunctionReturn(0); 314 } 315 316 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 317 { 318 PetscErrorCode ierr; 319 320 PetscFunctionBegin; 321 ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 322 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 323 PetscFunctionReturn(0); 324 } 325 326 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 327 { 328 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 329 PetscInt n = A->rmap->n; 330 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 331 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 332 cusparseStatus_t stat; 333 const PetscInt *ai = a->i,*aj = a->j,*vi; 334 

static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscBool transgen = cusparsestruct->transgen;

    ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If the user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower = n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the lower triangular matrix */
      AiLo[0]   = (PetscInt) 0;
      AiLo[n]   = nzLower;
      AjLo[0]   = (PetscInt) 0;
      AALo[0]   = (MatScalar) 1.0;
      v         = aa;
      vi        = aj;
      offset    = 1;
      rowOffset = 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows    = n;
      loTriFactor->csrMat->num_cols    = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* Create the solve analysis information */
      stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                     loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                     loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                     loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                     &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
      cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                               ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                               );CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
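
/* An illustrative example (not taken from this file) of the host arrays built above:
   for n = 3 with strictly lower triangular entries l10, l20, l21, the unit lower factor

       [ 1           ]
   L = [ l10  1      ]
       [ l20  l21  1 ]

   is laid out in CSR as

     AiLo = {0, 1, 3, 6}
     AjLo = {0,  0, 1,  0, 1, 2}
     AALo = {1.0,  l10, 1.0,  l20, l21, 1.0}

   i.e. each row holds its strictly lower entries followed by an explicit unit diagonal. */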

static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix */
      AiUp[0] = (PetscInt) 0;
      AiUp[n] = nzUpper;
      offset  = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows    = n;
      upTriFactor->csrMat->num_cols    = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* Create the solve analysis information */
      stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                     upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                     upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                     upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                     &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
      cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                               ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                               );CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
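
/* Note on the diagonal handling above (our reading of the factored-storage convention):
   PETSc's factored SeqAIJ format stores the reciprocal of the diagonal of U, so
   AAUp[offset] = 1./v[nz] recovers the true diagonal entry u_ii that the NON_UNIT
   cuSPARSE solve expects. Illustrative example (not taken from this file) for n = 3:

       [ u00 u01 u02 ]
   U = [     u11 u12 ]   =>  AiUp = {0, 3, 5, 6}
       [         u22 ]       AjUp = {0, 1, 2,  1, 2,  2}
                             AAUp = {u00, u01, u02,  u11, u12,  u22}
*/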

static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz        = a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /* upper triangular indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }

  if (!row_identity && !col_identity) {
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if (!row_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if (!col_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
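
/* The permutation vectors built above are consumed by the non-natural-ordering solve
   paths below through thrust gather operations; schematically (bGPU/xGPU are device
   pointers, n the local size):

     thrust::copy(thrust::make_permutation_iterator(bGPU,   rpermIndices->begin()),
                  thrust::make_permutation_iterator(bGPU+n, rpermIndices->end()),
                  xGPU);    // i.e. x[i] = b[rperm[i]]
*/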

static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix */
      AiUp[0] = (PetscInt) 0;
      AiUp[n] = nzUpper;
      offset  = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows    = A->rmap->n;
      upTriFactor->csrMat->num_cols    = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* Create the solve analysis information */
      stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                     upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                     upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                     upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                     &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
      cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                               ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                               );CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows    = A->rmap->n;
      loTriFactor->csrMat->num_cols    = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
      ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);

      /* Create the solve analysis information */
      stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                     loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                     loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                     loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                     &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
      cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                               ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                               );CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->offloadmask = PETSC_OFFLOAD_BOTH;
      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz        = (a->nz-n)*2 + n;

  /* lower triangular indices */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
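
/* A minimal end-to-end sketch (illustrative; setup and cleanup elided) of the
   factor/solve pipeline these routines plug into; A, b, x are placeholders:

     Mat           F;
     IS            isrow,iscol;
     MatFactorInfo info;
     ierr = MatFactorInfoInitialize(&info);CHKERRQ(ierr);
     ierr = MatGetOrdering(A,MATORDERINGND,&isrow,&iscol);CHKERRQ(ierr);
     ierr = MatGetFactor(A,MATSOLVERCUSPARSE,MAT_FACTOR_LU,&F);CHKERRQ(ierr);
     ierr = MatLUFactorSymbolic(F,A,isrow,iscol,&info);CHKERRQ(ierr);
     ierr = MatLUFactorNumeric(F,A,&info);CHKERRQ(ierr);  // dispatches to MatLUFactorNumeric_SeqAIJCUSPARSE above
     ierr = MatSolve(F,b,x);CHKERRQ(ierr);                // triangular solves run through cuSPARSE
*/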

static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor  = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor  = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
               CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor */
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                       loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                       loTriFactor->csrMat->values->data().get(),
                                       loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(),
                                       loTriFactorT->csrMat->values->data().get(),
                                       loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cudaError_t cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
#else
                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
#endif
                          );CHKERRCUSPARSE(stat);

  /* Create the solve analysis information */
  stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                           loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                           ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
#endif
                           );CHKERRCUSPARSE(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
               CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor */
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                       upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                       upTriFactor->csrMat->values->data().get(),
                                       upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(),
                                       upTriFactorT->csrMat->values->data().get(),
                                       upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
#else
                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
#endif
                          );CHKERRCUSPARSE(stat);

  /* Create the solve analysis information */
  stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                           upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                           ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
#endif
                           );CHKERRCUSPARSE(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
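
/* Why csr2csc gives the transpose: the CSC arrays of A are exactly the CSR arrays of A^T,
   so converting CSR to CSC (as done above) materializes A^T in CSR form. Illustrative 2x2
   example (not taken from this file):

     A   = [a b; 0 c]  in CSR: rows = {0,2,3}, cols = {0,1,1}, vals = {a,b,c}
     A^T = [a 0; b c]  in CSR: rows = {0,1,3}, cols = {0,0,1}, vals = {a,b,c}
*/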

static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct  = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (!cusparsestruct->transgen || cusparsestruct->matTranspose) PetscFunctionReturn(0);
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* create cusparse matrix */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

  /* set alpha and beta */
  err = cudaMalloc((void **)&(matstructT->alpha_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
    cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

    /* compute the transpose, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                         A->cmap->n, matrix->num_entries,
                                         matrix->values->data().get(),
                                         cusparsestruct->rowoffsets_gpu->data().get(),
                                         matrix->column_indices->data().get(),
                                         matrixT->values->data().get(),
                                         matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                         CUSPARSE_ACTION_NUMERIC,indexBase,
                                         cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat);
    err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err);
#endif

    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            cusparsestruct->rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                            CUSPARSE_ACTION_NUMERIC,indexBase,
                            cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
#else
                            matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase
#endif
                            );CHKERRCUSPARSE(stat);
    matstructT->mat = matrixT;

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    stat = cusparseCreateCsr(&matstructT->matDescr,
                             matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                             matrixT->values->data().get(),
                             CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                             indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
    CsrMatrix *temp  = new CsrMatrix;
    CsrMatrix *tempT = new CsrMatrix;
    /* First convert HYB to CSR */
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSPARSE(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
                                       CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSPARSE(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
#endif
  }
  err  = WaitForCUDA();CHKERRCUDA(err);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices are not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        upTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get()
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        loTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
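/* Note on the solve order above (an explanatory sketch added for exposition):
   with A factored as A = L*U (permutations aside), the transpose satisfies
   A^T = (L*U)^T = U^T * L^T, so A^T x = b is solved by a sweep with U^T
   followed by a sweep with L^T; hence U is solved before L here, the reverse
   of the order used in MatSolve_SeqAIJCUSPARSE below. */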
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        upTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get()
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        loTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        loTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Then, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        upTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get()
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        loTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get()
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Next, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        upTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* Copy values only */
      CsrMatrix *matrix,*matrixT;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      matrix->values->assign(a->a, a->a+a->nz);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);

      /* Update matT when it was built before */
      if (cusparsestruct->matTranspose) {
        cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
        matrixT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
        ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                                A->cmap->n, matrix->num_entries,
                                matrix->values->data().get(),
                                cusparsestruct->rowoffsets_gpu->data().get(),
                                matrix->column_indices->data().get(),
                                matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                CUSPARSE_ACTION_NUMERIC,indexBase,
                                cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
#else
                                matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                                CUSPARSE_ACTION_NUMERIC, indexBase
#endif
                                );CHKERRCUSPARSE(stat);
        err  = WaitForCUDA();CHKERRCUDA(err);
        ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
      }
    } else {
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      try {
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        cusparsestruct->nrows = m;

        /* create cusparse matrix */
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        err = cudaMalloc((void **)&(matstruct->alpha_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat = new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = a->nz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(a->nz);
          mat->column_indices->assign(a->j, a->j+a->nz);

          mat->values = new THRUSTARRAY(a->nz);
          mat->values->assign(a->a, a->a+a->nz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                     mat->num_rows, mat->num_cols, mat->num_entries,
                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                     mat->values->data().get(),
                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          CsrMatrix *mat = new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = a->nz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(a->nz);
          mat->column_indices->assign(a->j, a->j+a->nz);

          mat->values = new THRUSTARRAY(a->nz);
          mat->values->assign(a->a, a->a+a->nz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
                                  matstruct->descr, mat->values->data().get(),
                                  mat->row_offsets->data().get(),
                                  mat->column_indices->data().get(),
                                  hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
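/* A hedged usage sketch (added for exposition, not part of PETSc itself):
   host-side assembly leaves the offload mask at PETSC_OFFLOAD_CPU, and the
   first GPU operation triggers the copy above; when the nonzero pattern is
   unchanged, only the values are re-uploaded on later modifications. */
#if 0
  Vec      x,y;
  PetscInt m,n;
  ierr = MatGetSize(A,&m,&n);CHKERRQ(ierr);
  ierr = VecCreateSeqCUDA(PETSC_COMM_SELF,n,&x);CHKERRQ(ierr);
  ierr = VecCreateSeqCUDA(PETSC_COMM_SELF,m,&y);CHKERRQ(ierr);
  ierr = MatMult(A,x,y);CHKERRQ(ierr);  /* first GPU op: structure + values copied to the GPU */
  ierr = MatScale(A,2.0);CHKERRQ(ierr); /* modifies values on the CPU; offload mask drops back to CPU */
  ierr = MatMult(A,x,y);CHKERRQ(ierr);  /* unchanged pattern: only the values are re-uploaded */
#endif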
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};

struct MatMatCusparse {
  PetscBool   cisdense;
  PetscScalar *Bt;
  Mat         X;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;
  cusparseDnMatDescr_t matCDescr;
  size_t               spmmBufferSize;
  void                 *spmmBuffer;
  PetscInt             Blda,Clda; /* Record leading dimensions of B and C here to detect changes */
#endif
};

static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  PetscErrorCode ierr;
  MatMatCusparse *mmdata = (MatMatCusparse *)data;
  cudaError_t    cerr;

  PetscFunctionBegin;
  cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
  if (mmdata->matBDescr)  {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat);}
  if (mmdata->matCDescr)  {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat);}
  if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);}
#endif
  ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);

static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!cusp->transgen) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate spmmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmdata->spmmBufferSize);CHKERRCUSPARSE(stat);
    if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);}
    cerr = cudaMalloc(&mmdata->spmmBuffer,mmdata->spmmBufferSize);CHKERRCUDA(cerr);
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->spmmBuffer);CHKERRCUSPARSE(stat);
#else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
#endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
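/* A hedged usage sketch (added for exposition, not part of PETSc itself):
   the symbolic/numeric routines here are typically reached through the
   MatProduct interface for C = A*B with a dense B. */
#if 0
  Mat C;
  ierr = MatProductCreate(A,B,NULL,&C);CHKERRQ(ierr); /* A is MATSEQAIJCUSPARSE, B is dense */
  ierr = MatProductSetType(C,MATPRODUCT_AB);CHKERRQ(ierr);
  ierr = MatProductSetFromOptions(C);CHKERRQ(ierr);   /* selects the routines in this file */
  ierr = MatProductSymbolic(C);CHKERRQ(ierr);         /* allocates product data and sizes C */
  ierr = MatProductNumeric(C);CHKERRQ(ierr);          /* runs cusparseSpMM (or csrmm pre-CUDA 11) */
  ierr = MatDestroy(&C);CHKERRQ(ierr);
#endif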
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense;
#if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/* handles dense B */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product    *product = C->product;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A");
  if (product->A->boundtocpu) {
    ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(C);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_AtB:
  case MATPRODUCT_ABt:
  case MATPRODUCT_PtAP:
  case MATPRODUCT_RARt:
    C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
  default:
    break;
  }
  PetscFunctionReturn(0);
}
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny;
#endif

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) {
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  } else {
    if (herm || !cusparsestruct->transgen) {
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
         allocated to accommodate different uses. So we get the length info directly from mat.
      */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
      */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                       matstruct->matDescr,
                                       matstruct->cuSpMV[opA].vecXDescr, beta,
                                       matstruct->cuSpMV[opA].vecYDescr,
                                       cusparse_scalartype,
                                       cusparsestruct->spmvAlg,
                                       &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                          matstruct->alpha_one,
                          matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */
                          matstruct->cuSpMV[opA].vecXDescr,
                          beta,
                          matstruct->cuSpMV[opA].vecYDescr,
                          cusparse_scalartype,
                          cusparsestruct->spmvAlg,
                          matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
#else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
#endif
      }
    }
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);

        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
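/* Explanatory note (added for exposition): when the matrix stores compressed
   rows, the kernel above computes the product only for the nonzero rows into
   the work vector and then scatters it into the full-length result,
   conceptually z[cprowIndices[i]] += work[i], via the VecCUDAPlusEquals
   functor; the transpose path gathers x the same way before the SpMV. */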
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode             ierr;
  PetscSplitCSRDataStructure *d_mat = NULL, h_mat;
  PetscBool                  is_seq = PETSC_TRUE;
  PetscInt                   nnz_state = A->nonzerostate;
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
  }
  if (d_mat) {
    cudaError_t err;
    ierr = PetscInfo(A,"Assemble device matrix\n");CHKERRQ(ierr);
    err = cudaMemcpy(&h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    nnz_state = h_mat.nonzerostate;
    is_seq    = h_mat.seq;
  }
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it?
  if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0);
  if (A->factortype == MAT_FACTOR_NONE && A->nonzerostate >= nnz_state && is_seq) { // assembled on CPU even though equipped for GPU
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  } else if (nnz_state > A->nonzerostate) {
    A->offloadmask = PETSC_OFFLOAD_GPU;
  }

  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation. For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode             ierr;
  PetscSplitCSRDataStructure *d_mat = NULL;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
    ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL;
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  if (d_mat) {
    Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
    cudaError_t                err;
    PetscSplitCSRDataStructure h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    err  = cudaMemcpy(&h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    if (h_mat.seq) {
      if (a->compressedrow.use) {
        err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
      }
      err = cudaFree(d_mat);CHKERRCUDA(err);
    }
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  /* Currently, there is no case in which an AIJCUSPARSE matrix ever has its offloadmask set to PETSC_OFFLOAD_GPU.
     If this changes, we need to implement a routine to update the CPU (host) version of the matrix from the GPU one.
     Right now, for safety we simply check for PETSC_OFFLOAD_GPU and have MatBindToCPU() error in this case.
     TODO: Add MatAIJCUSPARSECopyFromGPU() and make MatBindToCPU() functional for AIJCUSPARSE matrices;
     one can follow the example of MatBindToCPU_SeqAIJViennaCL(). */
  if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"PETSC_OFFLOAD_GPU should not happen. Please report your use case to petsc-dev@mcs.anl.gov");
  /* TODO: add support for this? */
  if (flg) {
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  } else {
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  PetscFunctionReturn(0);
}
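/* Usage note (added for exposition): MatBindToCPU(A,PETSC_TRUE) switches the
   operation table above back to the host MatMult_SeqAIJ kernels, and
   PETSC_FALSE restores the CUSPARSE implementations. */
#if 0
  ierr = MatBindToCPU(A,PETSC_TRUE);CHKERRQ(ierr);  /* run subsequent MatMult etc. on the CPU */
  ierr = MatBindToCPU(A,PETSC_FALSE);CHKERRQ(ierr); /* back to the GPU */
#endif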
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscSplitCSRDataStructure *d_mat = NULL;
  PetscErrorCode             ierr;
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    d_mat = spptr->deviceMat;
  }
  if (d_mat) {
    Mat_SeqAIJ  *a = (Mat_SeqAIJ*)A->data;
    PetscInt    n = A->rmap->n, nnz = a->i[n];
    cudaError_t err;
    PetscScalar *vals;
    ierr = PetscInfo(A,"Zero device matrix\n");CHKERRQ(ierr);
    err = cudaMemcpy(&vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    err = cudaMemset(vals, 0, (nnz)*sizeof(PetscScalar));CHKERRCUDA(err);
  }
  ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat;

  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      spptr->format = MAT_CUSPARSE_CSR;
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
      spptr->deviceMat = NULL;
    } else {
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;
  B->ops->zeroentries    = MatZeroEntries_SeqAIJCUSPARSE;

  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr);
  ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
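/* A hedged sketch (added for exposition): an existing MATSEQAIJ matrix can be
   switched to this type in place, or the type can be chosen up front. */
#if 0
  ierr = MatConvert(A,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  /* or, before assembly: */
  ierr = MatSetType(A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* or from the command line with MatSetFromOptions(): -mat_type aijcusparse */
#endif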
/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

   Level: beginner

.seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);

PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
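/* A hedged usage sketch (added for exposition; error checking abbreviated):
   the registrations above make the CUSPARSE factorizations and triangular
   solves reachable through the standard MatGetFactor() interface. */
#if 0
  Mat           F;
  IS            isrow,iscol;
  MatFactorInfo info;
  ierr = MatGetFactor(A,MATSOLVERCUSPARSE,MAT_FACTOR_LU,&F);CHKERRQ(ierr);
  ierr = MatGetOrdering(A,MATORDERINGND,&isrow,&iscol);CHKERRQ(ierr);
  ierr = MatFactorInfoInitialize(&info);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic(F,A,isrow,iscol,&info);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric(F,A,&info);CHKERRQ(ierr);
  ierr = MatSolve(F,b,x);CHKERRQ(ierr); /* runs the MatSolve_SeqAIJCUSPARSE path */
#endif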
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    if (handle = (*cusparsestruct)->handle) {stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cudaError_t cerr = cudaFree((*cusparsestruct)->csr2cscBuffer);CHKERRCUDA(cerr);
#endif
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    delete (*mat)->values;
    delete (*mat)->column_indices;
    delete (*mat)->row_offsets;
    delete *mat;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cudaError_t cerr;
    if ((*trifactor)->solveBuffer)   {cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
    if ((*trifactor)->csr2cscBuffer) {cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
#endif
    delete *trifactor;
    *trifactor = 0;
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
#endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        CsrMatrix_Destroy(&mat);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
#endif
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = 0;
    (*trifactors)->cpermIndices = 0;
    (*trifactors)->workVector = 0;
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    if (handle = (*trifactors)->handle) {
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}