1 /* 2 Defines the basic matrix operations for the AIJ (compressed row) 3 matrix storage format using the CUSPARSE library, 4 */ 5 #define PETSC_SKIP_SPINLOCK 6 #define PETSC_SKIP_CXX_COMPLEX_FIX 7 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 8 9 #include <petscconf.h> 10 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 11 #include <../src/mat/impls/sbaij/seq/sbaij.h> 12 #include <../src/vec/vec/impls/dvecimpl.h> 13 #include <petsc/private/vecimpl.h> 14 #undef VecType 15 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 16 17 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 18 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 19 /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in 20 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them. 21 22 typedef enum { 23 CUSPARSE_MV_ALG_DEFAULT = 0, 24 CUSPARSE_COOMV_ALG = 1, 25 CUSPARSE_CSRMV_ALG1 = 2, 26 CUSPARSE_CSRMV_ALG2 = 3 27 } cusparseSpMVAlg_t; 28 29 typedef enum { 30 CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0, 31 CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1, 32 CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2, 33 CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3, 34 CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4, 35 CUSPARSE_SPMM_ALG_DEFAULT = 0, 36 CUSPARSE_SPMM_COO_ALG1 = 1, 37 CUSPARSE_SPMM_COO_ALG2 = 2, 38 CUSPARSE_SPMM_COO_ALG3 = 3, 39 CUSPARSE_SPMM_COO_ALG4 = 5, 40 CUSPARSE_SPMM_CSR_ALG1 = 4, 41 CUSPARSE_SPMM_CSR_ALG2 = 6, 42 } cusparseSpMMAlg_t; 43 44 typedef enum { 45 CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc 46 CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministc 47 } cusparseCsr2CscAlg_t; 48 */ 49 const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0}; 50 const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0}; 51 const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0}; 52 #endif 53 54 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 55 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 56 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 57 58 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 59 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 60 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 61 62 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 63 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 64 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 65 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 66 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat); 67 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 68 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 69 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 70 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 71 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 72 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 73 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool); 74 75 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**); 76 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**); 77 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat); 78 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**); 79 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**); 80 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**); 81 82 PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream) 83 { 84 cusparseStatus_t stat; 85 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 86 87 PetscFunctionBegin; 88 if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 89 cusparsestruct->stream = stream; 90 stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat); 91 PetscFunctionReturn(0); 92 } 93 94 PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle) 95 { 96 cusparseStatus_t stat; 97 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 98 99 PetscFunctionBegin; 100 if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 101 if (cusparsestruct->handle != handle) { 102 if (cusparsestruct->handle) { 103 stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat); 104 } 105 cusparsestruct->handle = handle; 106 } 107 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 108 PetscFunctionReturn(0); 109 } 110 111 PetscErrorCode MatCUSPARSEClearHandle(Mat A) 112 { 113 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 114 115 PetscFunctionBegin; 116 if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 117 if (cusparsestruct->handle) cusparsestruct->handle = 0; 118 PetscFunctionReturn(0); 119 } 120 121 PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type) 122 { 123 PetscFunctionBegin; 124 *type = MATSOLVERCUSPARSE; 125 PetscFunctionReturn(0); 126 } 127 128 /*MC 129 MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices 130 on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported 131 algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 132 performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 133 CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 134 algorithms are not recommended. This class does NOT support direct solver operations. 135 136 Level: beginner 137 138 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 139 M*/ 140 141 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B) 142 { 143 PetscErrorCode ierr; 144 PetscInt n = A->rmap->n; 145 146 PetscFunctionBegin; 147 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 148 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 149 (*B)->factortype = ftype; 150 (*B)->useordering = PETSC_TRUE; 151 ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 152 153 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 154 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 155 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 156 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 157 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 158 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 159 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 160 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 161 162 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 163 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr); 164 PetscFunctionReturn(0); 165 } 166 167 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 168 { 169 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 170 171 PetscFunctionBegin; 172 switch (op) { 173 case MAT_CUSPARSE_MULT: 174 cusparsestruct->format = format; 175 break; 176 case MAT_CUSPARSE_ALL: 177 cusparsestruct->format = format; 178 break; 179 default: 180 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op); 181 } 182 PetscFunctionReturn(0); 183 } 184 185 /*@ 186 MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 187 operation. Only the MatMult operation can use different GPU storage formats 188 for MPIAIJCUSPARSE matrices. 189 Not Collective 190 191 Input Parameters: 192 + A - Matrix of type SEQAIJCUSPARSE 193 . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 194 - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2) 195 196 Output Parameter: 197 198 Level: intermediate 199 200 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 201 @*/ 202 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 203 { 204 PetscErrorCode ierr; 205 206 PetscFunctionBegin; 207 PetscValidHeaderSpecific(A, MAT_CLASSID,1); 208 ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr); 209 PetscFunctionReturn(0); 210 } 211 212 /*@ 213 MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the tranpose matrix before calling MatMultTranspose 214 215 Collective on mat 216 217 Input Parameters: 218 + A - Matrix of type SEQAIJCUSPARSE 219 - transgen - the boolean flag 220 221 Level: intermediate 222 223 .seealso: MATSEQAIJCUSPARSE 224 @*/ 225 PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen) 226 { 227 PetscErrorCode ierr; 228 PetscBool flg; 229 230 PetscFunctionBegin; 231 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 232 ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 233 if (flg) { 234 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 235 236 if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 237 cusp->transgen = transgen; 238 if (!transgen) { /* need to destroy the transpose matrix if present to prevent from logic errors if transgen is set to true later */ 239 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr); 240 } 241 } 242 PetscFunctionReturn(0); 243 } 244 245 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 246 { 247 PetscErrorCode ierr; 248 MatCUSPARSEStorageFormat format; 249 PetscBool flg; 250 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 251 252 PetscFunctionBegin; 253 ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr); 254 if (A->factortype == MAT_FACTOR_NONE) { 255 PetscBool transgen = cusparsestruct->transgen; 256 257 ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr); 258 if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);} 259 260 ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV", 261 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 262 if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);} 263 264 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 265 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 266 if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);} 267 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 268 cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 269 ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", 270 "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr); 271 /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */ 272 if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 273 274 cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 275 ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", 276 "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr); 277 if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly"); 278 279 cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 280 ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", 281 "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr); 282 if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly"); 283 #endif 284 } 285 ierr = PetscOptionsTail();CHKERRQ(ierr); 286 PetscFunctionReturn(0); 287 } 288 289 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 290 { 291 PetscErrorCode ierr; 292 293 PetscFunctionBegin; 294 ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 295 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 296 PetscFunctionReturn(0); 297 } 298 299 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 300 { 301 PetscErrorCode ierr; 302 303 PetscFunctionBegin; 304 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 305 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 306 PetscFunctionReturn(0); 307 } 308 309 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 310 { 311 PetscErrorCode ierr; 312 313 PetscFunctionBegin; 314 ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 315 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 316 PetscFunctionReturn(0); 317 } 318 319 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 320 { 321 PetscErrorCode ierr; 322 323 PetscFunctionBegin; 324 ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 325 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 326 PetscFunctionReturn(0); 327 } 328 329 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 330 { 331 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 332 PetscInt n = A->rmap->n; 333 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 334 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 335 cusparseStatus_t stat; 336 const PetscInt *ai = a->i,*aj = a->j,*vi; 337 const MatScalar *aa = a->a,*v; 338 PetscInt *AiLo, *AjLo; 339 PetscScalar *AALo; 340 PetscInt i,nz, nzLower, offset, rowOffset; 341 PetscErrorCode ierr; 342 cudaError_t cerr; 343 344 PetscFunctionBegin; 345 if (!n) PetscFunctionReturn(0); 346 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 347 try { 348 /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */ 349 nzLower=n+ai[n]-ai[1]; 350 351 /* Allocate Space for the lower triangular matrix */ 352 cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 353 cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr); 354 cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr); 355 356 /* Fill the lower triangular matrix */ 357 AiLo[0] = (PetscInt) 0; 358 AiLo[n] = nzLower; 359 AjLo[0] = (PetscInt) 0; 360 AALo[0] = (MatScalar) 1.0; 361 v = aa; 362 vi = aj; 363 offset = 1; 364 rowOffset= 1; 365 for (i=1; i<n; i++) { 366 nz = ai[i+1] - ai[i]; 367 /* additional 1 for the term on the diagonal */ 368 AiLo[i] = rowOffset; 369 rowOffset += nz+1; 370 371 ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr); 372 ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr); 373 374 offset += nz; 375 AjLo[offset] = (PetscInt) i; 376 AALo[offset] = (MatScalar) 1.0; 377 offset += 1; 378 379 v += nz; 380 vi += nz; 381 } 382 383 /* allocate space for the triangular factor information */ 384 loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 385 386 /* Create the matrix description */ 387 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 388 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 389 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 390 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 391 #else 392 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 393 #endif 394 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat); 395 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 396 397 /* set the operation */ 398 loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 399 400 /* set the matrix */ 401 loTriFactor->csrMat = new CsrMatrix; 402 loTriFactor->csrMat->num_rows = n; 403 loTriFactor->csrMat->num_cols = n; 404 loTriFactor->csrMat->num_entries = nzLower; 405 406 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 407 loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1); 408 409 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 410 loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower); 411 412 loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 413 loTriFactor->csrMat->values->assign(AALo, AALo+nzLower); 414 415 /* Create the solve analysis information */ 416 stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 417 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 418 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 419 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 420 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 421 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 422 &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 423 cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 424 #endif 425 426 /* perform the solve analysis */ 427 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 428 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 429 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 430 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo 431 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 432 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 433 #endif 434 );CHKERRCUSPARSE(stat); 435 436 /* assign the pointer. Is this really necessary? */ 437 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 438 439 cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr); 440 cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr); 441 cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr); 442 ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 443 } catch(char *ex) { 444 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 445 } 446 } 447 PetscFunctionReturn(0); 448 } 449 450 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 451 { 452 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 453 PetscInt n = A->rmap->n; 454 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 455 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 456 cusparseStatus_t stat; 457 const PetscInt *aj = a->j,*adiag = a->diag,*vi; 458 const MatScalar *aa = a->a,*v; 459 PetscInt *AiUp, *AjUp; 460 PetscScalar *AAUp; 461 PetscInt i,nz, nzUpper, offset; 462 PetscErrorCode ierr; 463 cudaError_t cerr; 464 465 PetscFunctionBegin; 466 if (!n) PetscFunctionReturn(0); 467 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 468 try { 469 /* next, figure out the number of nonzeros in the upper triangular matrix. */ 470 nzUpper = adiag[0]-adiag[n]; 471 472 /* Allocate Space for the upper triangular matrix */ 473 cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 474 cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 475 cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 476 477 /* Fill the upper triangular matrix */ 478 AiUp[0]=(PetscInt) 0; 479 AiUp[n]=nzUpper; 480 offset = nzUpper; 481 for (i=n-1; i>=0; i--) { 482 v = aa + adiag[i+1] + 1; 483 vi = aj + adiag[i+1] + 1; 484 485 /* number of elements NOT on the diagonal */ 486 nz = adiag[i] - adiag[i+1]-1; 487 488 /* decrement the offset */ 489 offset -= (nz+1); 490 491 /* first, set the diagonal elements */ 492 AjUp[offset] = (PetscInt) i; 493 AAUp[offset] = (MatScalar)1./v[nz]; 494 AiUp[i] = AiUp[i+1] - (nz+1); 495 496 ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr); 497 ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr); 498 } 499 500 /* allocate space for the triangular factor information */ 501 upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 502 503 /* Create the matrix description */ 504 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 505 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 506 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 507 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 508 #else 509 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 510 #endif 511 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 512 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 513 514 /* set the operation */ 515 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 516 517 /* set the matrix */ 518 upTriFactor->csrMat = new CsrMatrix; 519 upTriFactor->csrMat->num_rows = n; 520 upTriFactor->csrMat->num_cols = n; 521 upTriFactor->csrMat->num_entries = nzUpper; 522 523 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 524 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 525 526 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 527 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 528 529 upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 530 upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 531 532 /* Create the solve analysis information */ 533 stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 534 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 535 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 536 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 537 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 538 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 539 &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 540 cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 541 #endif 542 543 /* perform the solve analysis */ 544 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 545 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 546 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 547 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo 548 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 549 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 550 #endif 551 );CHKERRCUSPARSE(stat); 552 553 /* assign the pointer. Is this really necessary? */ 554 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 555 556 cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 557 cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 558 cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr); 559 ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 560 } catch(char *ex) { 561 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 562 } 563 } 564 PetscFunctionReturn(0); 565 } 566 567 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 568 { 569 PetscErrorCode ierr; 570 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 571 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 572 IS isrow = a->row,iscol = a->icol; 573 PetscBool row_identity,col_identity; 574 const PetscInt *r,*c; 575 PetscInt n = A->rmap->n; 576 577 PetscFunctionBegin; 578 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 579 ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr); 580 ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr); 581 582 cusparseTriFactors->workVector = new THRUSTARRAY(n); 583 cusparseTriFactors->nnz=a->nz; 584 585 A->offloadmask = PETSC_OFFLOAD_BOTH; 586 /* lower triangular indices */ 587 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 588 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 589 if (!row_identity) { 590 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 591 cusparseTriFactors->rpermIndices->assign(r, r+n); 592 } 593 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 594 595 /* upper triangular indices */ 596 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 597 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 598 if (!col_identity) { 599 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 600 cusparseTriFactors->cpermIndices->assign(c, c+n); 601 } 602 603 if (!row_identity && !col_identity) { 604 ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr); 605 } else if(!row_identity) { 606 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 607 } else if(!col_identity) { 608 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 609 } 610 611 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 612 PetscFunctionReturn(0); 613 } 614 615 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 616 { 617 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 618 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 619 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 620 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 621 cusparseStatus_t stat; 622 PetscErrorCode ierr; 623 cudaError_t cerr; 624 PetscInt *AiUp, *AjUp; 625 PetscScalar *AAUp; 626 PetscScalar *AALo; 627 PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 628 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 629 const PetscInt *ai = b->i,*aj = b->j,*vj; 630 const MatScalar *aa = b->a,*v; 631 632 PetscFunctionBegin; 633 if (!n) PetscFunctionReturn(0); 634 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 635 try { 636 /* Allocate Space for the upper triangular matrix */ 637 cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 638 cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 639 cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 640 cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 641 642 /* Fill the upper triangular matrix */ 643 AiUp[0]=(PetscInt) 0; 644 AiUp[n]=nzUpper; 645 offset = 0; 646 for (i=0; i<n; i++) { 647 /* set the pointers */ 648 v = aa + ai[i]; 649 vj = aj + ai[i]; 650 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 651 652 /* first, set the diagonal elements */ 653 AjUp[offset] = (PetscInt) i; 654 AAUp[offset] = (MatScalar)1.0/v[nz]; 655 AiUp[i] = offset; 656 AALo[offset] = (MatScalar)1.0/v[nz]; 657 658 offset+=1; 659 if (nz>0) { 660 ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr); 661 ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 662 for (j=offset; j<offset+nz; j++) { 663 AAUp[j] = -AAUp[j]; 664 AALo[j] = AAUp[j]/v[nz]; 665 } 666 offset+=nz; 667 } 668 } 669 670 /* allocate space for the triangular factor information */ 671 upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 672 673 /* Create the matrix description */ 674 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 675 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 676 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 677 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 678 #else 679 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 680 #endif 681 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 682 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 683 684 /* set the matrix */ 685 upTriFactor->csrMat = new CsrMatrix; 686 upTriFactor->csrMat->num_rows = A->rmap->n; 687 upTriFactor->csrMat->num_cols = A->cmap->n; 688 upTriFactor->csrMat->num_entries = a->nz; 689 690 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 691 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 692 693 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 694 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 695 696 upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 697 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 698 699 /* set the operation */ 700 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 701 702 /* Create the solve analysis information */ 703 stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 704 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 705 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 706 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 707 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 708 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 709 &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 710 cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 711 #endif 712 713 /* perform the solve analysis */ 714 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 715 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 716 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 717 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo 718 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 719 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 720 #endif 721 );CHKERRCUSPARSE(stat); 722 723 /* assign the pointer. Is this really necessary? */ 724 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 725 726 /* allocate space for the triangular factor information */ 727 loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 728 729 /* Create the matrix description */ 730 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 731 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 732 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 733 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 734 #else 735 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 736 #endif 737 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 738 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 739 740 /* set the operation */ 741 loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 742 743 /* set the matrix */ 744 loTriFactor->csrMat = new CsrMatrix; 745 loTriFactor->csrMat->num_rows = A->rmap->n; 746 loTriFactor->csrMat->num_cols = A->cmap->n; 747 loTriFactor->csrMat->num_entries = a->nz; 748 749 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 750 loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 751 752 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 753 loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 754 755 loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 756 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 757 ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr); 758 759 /* Create the solve analysis information */ 760 stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 761 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 762 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 763 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 764 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 765 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 766 &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 767 cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 768 #endif 769 770 /* perform the solve analysis */ 771 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 772 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 773 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 774 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo 775 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 776 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 777 #endif 778 );CHKERRCUSPARSE(stat); 779 780 /* assign the pointer. Is this really necessary? */ 781 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 782 783 A->offloadmask = PETSC_OFFLOAD_BOTH; 784 cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 785 cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 786 cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr); 787 cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr); 788 } catch(char *ex) { 789 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 790 } 791 } 792 PetscFunctionReturn(0); 793 } 794 795 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 796 { 797 PetscErrorCode ierr; 798 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 799 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 800 IS ip = a->row; 801 const PetscInt *rip; 802 PetscBool perm_identity; 803 PetscInt n = A->rmap->n; 804 805 PetscFunctionBegin; 806 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 807 ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr); 808 cusparseTriFactors->workVector = new THRUSTARRAY(n); 809 cusparseTriFactors->nnz=(a->nz-n)*2 + n; 810 811 /* lower triangular indices */ 812 ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); 813 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 814 if (!perm_identity) { 815 IS iip; 816 const PetscInt *irip; 817 818 ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr); 819 ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr); 820 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 821 cusparseTriFactors->rpermIndices->assign(rip, rip+n); 822 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 823 cusparseTriFactors->cpermIndices->assign(irip, irip+n); 824 ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr); 825 ierr = ISDestroy(&iip);CHKERRQ(ierr); 826 ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr); 827 } 828 ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 829 PetscFunctionReturn(0); 830 } 831 832 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 833 { 834 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 835 IS isrow = b->row,iscol = b->col; 836 PetscBool row_identity,col_identity; 837 PetscErrorCode ierr; 838 839 PetscFunctionBegin; 840 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 841 B->offloadmask = PETSC_OFFLOAD_CPU; 842 /* determine which version of MatSolve needs to be used. */ 843 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 844 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 845 if (row_identity && col_identity) { 846 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 847 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 848 B->ops->matsolve = NULL; 849 B->ops->matsolvetranspose = NULL; 850 } else { 851 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 852 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 853 B->ops->matsolve = NULL; 854 B->ops->matsolvetranspose = NULL; 855 } 856 857 /* get the triangular factors */ 858 ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 859 PetscFunctionReturn(0); 860 } 861 862 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 863 { 864 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 865 IS ip = b->row; 866 PetscBool perm_identity; 867 PetscErrorCode ierr; 868 869 PetscFunctionBegin; 870 ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 871 B->offloadmask = PETSC_OFFLOAD_CPU; 872 /* determine which version of MatSolve needs to be used. */ 873 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 874 if (perm_identity) { 875 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 876 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 877 B->ops->matsolve = NULL; 878 B->ops->matsolvetranspose = NULL; 879 } else { 880 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 881 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 882 B->ops->matsolve = NULL; 883 B->ops->matsolvetranspose = NULL; 884 } 885 886 /* get the triangular factors */ 887 ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 888 PetscFunctionReturn(0); 889 } 890 891 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 892 { 893 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 894 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 895 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 896 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 897 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 898 cusparseStatus_t stat; 899 cusparseIndexBase_t indexBase; 900 cusparseMatrixType_t matrixType; 901 cusparseFillMode_t fillMode; 902 cusparseDiagType_t diagType; 903 904 PetscFunctionBegin; 905 906 /*********************************************/ 907 /* Now the Transpose of the Lower Tri Factor */ 908 /*********************************************/ 909 910 /* allocate space for the transpose of the lower triangular factor */ 911 loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 912 913 /* set the matrix descriptors of the lower triangular factor */ 914 matrixType = cusparseGetMatType(loTriFactor->descr); 915 indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 916 fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 917 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 918 diagType = cusparseGetMatDiagType(loTriFactor->descr); 919 920 /* Create the matrix description */ 921 stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat); 922 stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 923 stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 924 stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 925 stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 926 927 /* set the operation */ 928 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 929 930 /* allocate GPU space for the CSC of the lower triangular factor*/ 931 loTriFactorT->csrMat = new CsrMatrix; 932 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 933 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 934 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 935 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1); 936 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 937 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 938 939 /* compute the transpose of the lower triangular factor, i.e. the CSC */ 940 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 941 stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 942 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 943 loTriFactor->csrMat->values->data().get(), 944 loTriFactor->csrMat->row_offsets->data().get(), 945 loTriFactor->csrMat->column_indices->data().get(), 946 loTriFactorT->csrMat->values->data().get(), 947 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 948 CUSPARSE_ACTION_NUMERIC,indexBase, 949 CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 950 cudaError_t cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 951 #endif 952 953 stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 954 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 955 loTriFactor->csrMat->values->data().get(), 956 loTriFactor->csrMat->row_offsets->data().get(), 957 loTriFactor->csrMat->column_indices->data().get(), 958 loTriFactorT->csrMat->values->data().get(), 959 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 960 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 961 CUSPARSE_ACTION_NUMERIC, indexBase, 962 CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer 963 #else 964 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 965 CUSPARSE_ACTION_NUMERIC, indexBase 966 #endif 967 );CHKERRCUSPARSE(stat); 968 969 /* Create the solve analysis information */ 970 stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 971 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 972 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, 973 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 974 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 975 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, 976 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 977 cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 978 #endif 979 980 /* perform the solve analysis */ 981 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 982 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 983 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 984 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo 985 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 986 ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 987 #endif 988 );CHKERRCUSPARSE(stat); 989 990 /* assign the pointer. Is this really necessary? */ 991 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 992 993 /*********************************************/ 994 /* Now the Transpose of the Upper Tri Factor */ 995 /*********************************************/ 996 997 /* allocate space for the transpose of the upper triangular factor */ 998 upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 999 1000 /* set the matrix descriptors of the upper triangular factor */ 1001 matrixType = cusparseGetMatType(upTriFactor->descr); 1002 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 1003 fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1004 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1005 diagType = cusparseGetMatDiagType(upTriFactor->descr); 1006 1007 /* Create the matrix description */ 1008 stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat); 1009 stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 1010 stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 1011 stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 1012 stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1013 1014 /* set the operation */ 1015 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1016 1017 /* allocate GPU space for the CSC of the upper triangular factor*/ 1018 upTriFactorT->csrMat = new CsrMatrix; 1019 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1020 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1021 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1022 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1); 1023 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1024 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1025 1026 /* compute the transpose of the upper triangular factor, i.e. the CSC */ 1027 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1028 stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows, 1029 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1030 upTriFactor->csrMat->values->data().get(), 1031 upTriFactor->csrMat->row_offsets->data().get(), 1032 upTriFactor->csrMat->column_indices->data().get(), 1033 upTriFactorT->csrMat->values->data().get(), 1034 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1035 CUSPARSE_ACTION_NUMERIC,indexBase, 1036 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1037 cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1038 #endif 1039 1040 stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 1041 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1042 upTriFactor->csrMat->values->data().get(), 1043 upTriFactor->csrMat->row_offsets->data().get(), 1044 upTriFactor->csrMat->column_indices->data().get(), 1045 upTriFactorT->csrMat->values->data().get(), 1046 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1047 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1048 CUSPARSE_ACTION_NUMERIC, indexBase, 1049 CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer 1050 #else 1051 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1052 CUSPARSE_ACTION_NUMERIC, indexBase 1053 #endif 1054 );CHKERRCUSPARSE(stat); 1055 1056 /* Create the solve analysis information */ 1057 stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1058 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1059 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, 1060 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1061 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1062 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, 1063 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1064 cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1065 #endif 1066 1067 /* perform the solve analysis */ 1068 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 1069 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1070 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1071 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo 1072 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1073 ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1074 #endif 1075 );CHKERRCUSPARSE(stat); 1076 1077 /* assign the pointer. Is this really necessary? */ 1078 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 1079 PetscFunctionReturn(0); 1080 } 1081 1082 static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A) 1083 { 1084 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1085 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1086 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1087 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1088 cusparseStatus_t stat; 1089 cusparseIndexBase_t indexBase; 1090 cudaError_t err; 1091 PetscErrorCode ierr; 1092 1093 PetscFunctionBegin; 1094 if (!cusparsestruct->transgen || cusparsestruct->matTranspose) PetscFunctionReturn(0); 1095 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1096 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1097 /* create cusparse matrix */ 1098 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 1099 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat); 1100 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1101 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat); 1102 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1103 1104 /* set alpha and beta */ 1105 err = cudaMalloc((void **)&(matstructT->alpha_one), sizeof(PetscScalar));CHKERRCUDA(err); 1106 err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1107 err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1108 err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1109 err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1110 err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1111 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 1112 1113 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1114 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1115 CsrMatrix *matrixT= new CsrMatrix; 1116 matrixT->num_rows = A->cmap->n; 1117 matrixT->num_cols = A->rmap->n; 1118 matrixT->num_entries = a->nz; 1119 matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 1120 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1121 matrixT->values = new THRUSTARRAY(a->nz); 1122 1123 cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); 1124 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 1125 1126 /* compute the transpose, i.e. the CSC */ 1127 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1128 stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, 1129 A->cmap->n, matrix->num_entries, 1130 matrix->values->data().get(), 1131 cusparsestruct->rowoffsets_gpu->data().get(), 1132 matrix->column_indices->data().get(), 1133 matrixT->values->data().get(), 1134 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1135 CUSPARSE_ACTION_NUMERIC,indexBase, 1136 cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1137 err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err); 1138 #endif 1139 1140 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1141 A->cmap->n, matrix->num_entries, 1142 matrix->values->data().get(), 1143 cusparsestruct->rowoffsets_gpu->data().get(), 1144 matrix->column_indices->data().get(), 1145 matrixT->values->data().get(), 1146 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1147 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1148 CUSPARSE_ACTION_NUMERIC,indexBase, 1149 cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer 1150 #else 1151 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1152 CUSPARSE_ACTION_NUMERIC, indexBase 1153 #endif 1154 );CHKERRCUSPARSE(stat); 1155 matstructT->mat = matrixT; 1156 1157 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1158 stat = cusparseCreateCsr(&matstructT->matDescr, 1159 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1160 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1161 matrixT->values->data().get(), 1162 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1163 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 1164 #endif 1165 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1166 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1167 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1168 #else 1169 CsrMatrix *temp = new CsrMatrix; 1170 CsrMatrix *tempT = new CsrMatrix; 1171 /* First convert HYB to CSR */ 1172 temp->num_rows = A->rmap->n; 1173 temp->num_cols = A->cmap->n; 1174 temp->num_entries = a->nz; 1175 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1176 temp->column_indices = new THRUSTINTARRAY32(a->nz); 1177 temp->values = new THRUSTARRAY(a->nz); 1178 1179 stat = cusparse_hyb2csr(cusparsestruct->handle, 1180 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 1181 temp->values->data().get(), 1182 temp->row_offsets->data().get(), 1183 temp->column_indices->data().get());CHKERRCUSPARSE(stat); 1184 1185 /* Next, convert CSR to CSC (i.e. the matrix transpose) */ 1186 tempT->num_rows = A->rmap->n; 1187 tempT->num_cols = A->cmap->n; 1188 tempT->num_entries = a->nz; 1189 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1190 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1191 tempT->values = new THRUSTARRAY(a->nz); 1192 1193 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1194 temp->num_cols, temp->num_entries, 1195 temp->values->data().get(), 1196 temp->row_offsets->data().get(), 1197 temp->column_indices->data().get(), 1198 tempT->values->data().get(), 1199 tempT->column_indices->data().get(), 1200 tempT->row_offsets->data().get(), 1201 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1202 1203 /* Last, convert CSC to HYB */ 1204 cusparseHybMat_t hybMat; 1205 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1206 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1207 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1208 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1209 matstructT->descr, tempT->values->data().get(), 1210 tempT->row_offsets->data().get(), 1211 tempT->column_indices->data().get(), 1212 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1213 1214 /* assign the pointer */ 1215 matstructT->mat = hybMat; 1216 /* delete temporaries */ 1217 if (tempT) { 1218 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1219 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1220 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1221 delete (CsrMatrix*) tempT; 1222 } 1223 if (temp) { 1224 if (temp->values) delete (THRUSTARRAY*) temp->values; 1225 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1226 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1227 delete (CsrMatrix*) temp; 1228 } 1229 #endif 1230 } 1231 err = WaitForCUDA();CHKERRCUDA(err); 1232 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1233 ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1234 /* the compressed row indices is not used for matTranspose */ 1235 matstructT->cprowIndices = NULL; 1236 /* assign the pointer */ 1237 ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 1238 PetscFunctionReturn(0); 1239 } 1240 1241 /* Why do we need to analyze the tranposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 1242 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1243 { 1244 PetscInt n = xx->map->n; 1245 const PetscScalar *barray; 1246 PetscScalar *xarray; 1247 thrust::device_ptr<const PetscScalar> bGPU; 1248 thrust::device_ptr<PetscScalar> xGPU; 1249 cusparseStatus_t stat; 1250 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1251 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1252 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1253 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1254 PetscErrorCode ierr; 1255 cudaError_t cerr; 1256 1257 PetscFunctionBegin; 1258 /* Analyze the matrix and create the transpose ... on the fly */ 1259 if (!loTriFactorT && !upTriFactorT) { 1260 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1261 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1262 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1263 } 1264 1265 /* Get the GPU pointers */ 1266 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1267 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1268 xGPU = thrust::device_pointer_cast(xarray); 1269 bGPU = thrust::device_pointer_cast(barray); 1270 1271 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1272 /* First, reorder with the row permutation */ 1273 thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1274 thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 1275 xGPU); 1276 1277 /* First, solve U */ 1278 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1279 upTriFactorT->csrMat->num_rows, 1280 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1281 upTriFactorT->csrMat->num_entries, 1282 #endif 1283 &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1284 upTriFactorT->csrMat->values->data().get(), 1285 upTriFactorT->csrMat->row_offsets->data().get(), 1286 upTriFactorT->csrMat->column_indices->data().get(), 1287 upTriFactorT->solveInfo, 1288 xarray, tempGPU->data().get() 1289 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1290 ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1291 #endif 1292 );CHKERRCUSPARSE(stat); 1293 1294 /* Then, solve L */ 1295 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1296 loTriFactorT->csrMat->num_rows, 1297 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1298 loTriFactorT->csrMat->num_entries, 1299 #endif 1300 &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1301 loTriFactorT->csrMat->values->data().get(), 1302 loTriFactorT->csrMat->row_offsets->data().get(), 1303 loTriFactorT->csrMat->column_indices->data().get(), 1304 loTriFactorT->solveInfo, 1305 tempGPU->data().get(), xarray 1306 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1307 ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1308 #endif 1309 );CHKERRCUSPARSE(stat); 1310 1311 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1312 thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1313 thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1314 tempGPU->begin()); 1315 1316 /* Copy the temporary to the full solution. */ 1317 thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU); 1318 1319 /* restore */ 1320 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1321 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1322 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1323 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1324 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1325 PetscFunctionReturn(0); 1326 } 1327 1328 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1329 { 1330 const PetscScalar *barray; 1331 PetscScalar *xarray; 1332 cusparseStatus_t stat; 1333 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1334 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1335 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1336 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1337 PetscErrorCode ierr; 1338 cudaError_t cerr; 1339 1340 PetscFunctionBegin; 1341 /* Analyze the matrix and create the transpose ... on the fly */ 1342 if (!loTriFactorT && !upTriFactorT) { 1343 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1344 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1345 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1346 } 1347 1348 /* Get the GPU pointers */ 1349 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1350 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1351 1352 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1353 /* First, solve U */ 1354 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1355 upTriFactorT->csrMat->num_rows, 1356 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1357 upTriFactorT->csrMat->num_entries, 1358 #endif 1359 &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1360 upTriFactorT->csrMat->values->data().get(), 1361 upTriFactorT->csrMat->row_offsets->data().get(), 1362 upTriFactorT->csrMat->column_indices->data().get(), 1363 upTriFactorT->solveInfo, 1364 barray, tempGPU->data().get() 1365 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1366 ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1367 #endif 1368 );CHKERRCUSPARSE(stat); 1369 1370 /* Then, solve L */ 1371 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1372 loTriFactorT->csrMat->num_rows, 1373 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1374 loTriFactorT->csrMat->num_entries, 1375 #endif 1376 &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1377 loTriFactorT->csrMat->values->data().get(), 1378 loTriFactorT->csrMat->row_offsets->data().get(), 1379 loTriFactorT->csrMat->column_indices->data().get(), 1380 loTriFactorT->solveInfo, 1381 tempGPU->data().get(), xarray 1382 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1383 ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1384 #endif 1385 );CHKERRCUSPARSE(stat); 1386 1387 /* restore */ 1388 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1389 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1390 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1391 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1392 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1393 PetscFunctionReturn(0); 1394 } 1395 1396 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1397 { 1398 const PetscScalar *barray; 1399 PetscScalar *xarray; 1400 thrust::device_ptr<const PetscScalar> bGPU; 1401 thrust::device_ptr<PetscScalar> xGPU; 1402 cusparseStatus_t stat; 1403 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1404 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1405 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1406 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1407 PetscErrorCode ierr; 1408 cudaError_t cerr; 1409 1410 PetscFunctionBegin; 1411 1412 /* Get the GPU pointers */ 1413 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1414 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1415 xGPU = thrust::device_pointer_cast(xarray); 1416 bGPU = thrust::device_pointer_cast(barray); 1417 1418 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1419 /* First, reorder with the row permutation */ 1420 thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1421 thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 1422 tempGPU->begin()); 1423 1424 /* Next, solve L */ 1425 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1426 loTriFactor->csrMat->num_rows, 1427 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1428 loTriFactor->csrMat->num_entries, 1429 #endif 1430 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1431 loTriFactor->csrMat->values->data().get(), 1432 loTriFactor->csrMat->row_offsets->data().get(), 1433 loTriFactor->csrMat->column_indices->data().get(), 1434 loTriFactor->solveInfo, 1435 tempGPU->data().get(), xarray 1436 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1437 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 1438 #endif 1439 );CHKERRCUSPARSE(stat); 1440 1441 /* Then, solve U */ 1442 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1443 upTriFactor->csrMat->num_rows, 1444 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1445 upTriFactor->csrMat->num_entries, 1446 #endif 1447 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1448 upTriFactor->csrMat->values->data().get(), 1449 upTriFactor->csrMat->row_offsets->data().get(), 1450 upTriFactor->csrMat->column_indices->data().get(), 1451 upTriFactor->solveInfo, 1452 xarray, tempGPU->data().get() 1453 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1454 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 1455 #endif 1456 );CHKERRCUSPARSE(stat); 1457 1458 /* Last, reorder with the column permutation */ 1459 thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 1460 thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), 1461 xGPU); 1462 1463 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1464 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1465 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1466 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1467 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1468 PetscFunctionReturn(0); 1469 } 1470 1471 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1472 { 1473 const PetscScalar *barray; 1474 PetscScalar *xarray; 1475 cusparseStatus_t stat; 1476 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1477 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1478 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1479 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1480 PetscErrorCode ierr; 1481 cudaError_t cerr; 1482 1483 PetscFunctionBegin; 1484 /* Get the GPU pointers */ 1485 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1486 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1487 1488 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1489 /* First, solve L */ 1490 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1491 loTriFactor->csrMat->num_rows, 1492 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1493 loTriFactor->csrMat->num_entries, 1494 #endif 1495 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1496 loTriFactor->csrMat->values->data().get(), 1497 loTriFactor->csrMat->row_offsets->data().get(), 1498 loTriFactor->csrMat->column_indices->data().get(), 1499 loTriFactor->solveInfo, 1500 barray, tempGPU->data().get() 1501 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1502 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 1503 #endif 1504 );CHKERRCUSPARSE(stat); 1505 1506 /* Next, solve U */ 1507 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1508 upTriFactor->csrMat->num_rows, 1509 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1510 upTriFactor->csrMat->num_entries, 1511 #endif 1512 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1513 upTriFactor->csrMat->values->data().get(), 1514 upTriFactor->csrMat->row_offsets->data().get(), 1515 upTriFactor->csrMat->column_indices->data().get(), 1516 upTriFactor->solveInfo, 1517 tempGPU->data().get(), xarray 1518 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1519 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 1520 #endif 1521 );CHKERRCUSPARSE(stat); 1522 1523 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1524 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1525 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1526 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1527 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1528 PetscFunctionReturn(0); 1529 } 1530 1531 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 1532 { 1533 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1534 Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 1535 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1536 PetscInt m = A->rmap->n,*ii,*ridx,tmp; 1537 PetscErrorCode ierr; 1538 cusparseStatus_t stat; 1539 cudaError_t err; 1540 1541 PetscFunctionBegin; 1542 if (A->boundtocpu) PetscFunctionReturn(0); 1543 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 1544 if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { 1545 /* Copy values only */ 1546 CsrMatrix *matrix,*matrixT; 1547 matrix = (CsrMatrix*)cusparsestruct->mat->mat; 1548 1549 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1550 matrix->values->assign(a->a, a->a+a->nz); 1551 err = WaitForCUDA();CHKERRCUDA(err); 1552 ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 1553 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1554 1555 /* Update matT when it was built before */ 1556 if (cusparsestruct->matTranspose) { 1557 cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr); 1558 matrixT = (CsrMatrix*)cusparsestruct->matTranspose->mat; 1559 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1560 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1561 A->cmap->n, matrix->num_entries, 1562 matrix->values->data().get(), 1563 cusparsestruct->rowoffsets_gpu->data().get(), 1564 matrix->column_indices->data().get(), 1565 matrixT->values->data().get(), 1566 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1567 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1568 CUSPARSE_ACTION_NUMERIC,indexBase, 1569 cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer 1570 #else 1571 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1572 CUSPARSE_ACTION_NUMERIC, indexBase 1573 #endif 1574 );CHKERRCUSPARSE(stat); 1575 err = WaitForCUDA();CHKERRCUDA(err); 1576 ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1577 } 1578 } else { 1579 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1580 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr); 1581 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr); 1582 delete cusparsestruct->workVector; 1583 delete cusparsestruct->rowoffsets_gpu; 1584 try { 1585 if (a->compressedrow.use) { 1586 m = a->compressedrow.nrows; 1587 ii = a->compressedrow.i; 1588 ridx = a->compressedrow.rindex; 1589 } else { 1590 m = A->rmap->n; 1591 ii = a->i; 1592 ridx = NULL; 1593 } 1594 cusparsestruct->nrows = m; 1595 1596 /* create cusparse matrix */ 1597 matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 1598 stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat); 1599 stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 1600 stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1601 1602 err = cudaMalloc((void **)&(matstruct->alpha_one), sizeof(PetscScalar));CHKERRCUDA(err); 1603 err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1604 err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1605 err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1606 err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1607 err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1608 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 1609 1610 /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1611 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1612 /* set the matrix */ 1613 CsrMatrix *mat= new CsrMatrix; 1614 mat->num_rows = m; 1615 mat->num_cols = A->cmap->n; 1616 mat->num_entries = a->nz; 1617 mat->row_offsets = new THRUSTINTARRAY32(m+1); 1618 mat->row_offsets->assign(ii, ii + m+1); 1619 1620 mat->column_indices = new THRUSTINTARRAY32(a->nz); 1621 mat->column_indices->assign(a->j, a->j+a->nz); 1622 1623 mat->values = new THRUSTARRAY(a->nz); 1624 mat->values->assign(a->a, a->a+a->nz); 1625 1626 /* assign the pointer */ 1627 matstruct->mat = mat; 1628 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1629 if (mat->num_rows) { /* cusparse errors on empty matrices! */ 1630 stat = cusparseCreateCsr(&matstruct->matDescr, 1631 mat->num_rows, mat->num_cols, mat->num_entries, 1632 mat->row_offsets->data().get(), mat->column_indices->data().get(), 1633 mat->values->data().get(), 1634 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 1635 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 1636 } 1637 #endif 1638 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1639 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1640 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1641 #else 1642 CsrMatrix *mat= new CsrMatrix; 1643 mat->num_rows = m; 1644 mat->num_cols = A->cmap->n; 1645 mat->num_entries = a->nz; 1646 mat->row_offsets = new THRUSTINTARRAY32(m+1); 1647 mat->row_offsets->assign(ii, ii + m+1); 1648 1649 mat->column_indices = new THRUSTINTARRAY32(a->nz); 1650 mat->column_indices->assign(a->j, a->j+a->nz); 1651 1652 mat->values = new THRUSTARRAY(a->nz); 1653 mat->values->assign(a->a, a->a+a->nz); 1654 1655 cusparseHybMat_t hybMat; 1656 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1657 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1658 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1659 stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, 1660 matstruct->descr, mat->values->data().get(), 1661 mat->row_offsets->data().get(), 1662 mat->column_indices->data().get(), 1663 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1664 /* assign the pointer */ 1665 matstruct->mat = hybMat; 1666 1667 if (mat) { 1668 if (mat->values) delete (THRUSTARRAY*)mat->values; 1669 if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices; 1670 if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets; 1671 delete (CsrMatrix*)mat; 1672 } 1673 #endif 1674 } 1675 1676 /* assign the compressed row indices */ 1677 if (a->compressedrow.use) { 1678 cusparsestruct->workVector = new THRUSTARRAY(m); 1679 matstruct->cprowIndices = new THRUSTINTARRAY(m); 1680 matstruct->cprowIndices->assign(ridx,ridx+m); 1681 tmp = m; 1682 } else { 1683 cusparsestruct->workVector = NULL; 1684 matstruct->cprowIndices = NULL; 1685 tmp = 0; 1686 } 1687 ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr); 1688 1689 /* assign the pointer */ 1690 cusparsestruct->mat = matstruct; 1691 } catch(char *ex) { 1692 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1693 } 1694 err = WaitForCUDA();CHKERRCUDA(err); 1695 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1696 cusparsestruct->nonzerostate = A->nonzerostate; 1697 } 1698 A->offloadmask = PETSC_OFFLOAD_BOTH; 1699 } 1700 PetscFunctionReturn(0); 1701 } 1702 1703 struct VecCUDAPlusEquals 1704 { 1705 template <typename Tuple> 1706 __host__ __device__ 1707 void operator()(Tuple t) 1708 { 1709 thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 1710 } 1711 }; 1712 1713 struct VecCUDAEqualsReverse 1714 { 1715 template <typename Tuple> 1716 __host__ __device__ 1717 void operator()(Tuple t) 1718 { 1719 thrust::get<0>(t) = thrust::get<1>(t); 1720 } 1721 }; 1722 1723 struct MatMatCusparse { 1724 PetscBool cisdense; 1725 PetscScalar *Bt; 1726 Mat X; 1727 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1728 PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 1729 cusparseDnMatDescr_t matBDescr; 1730 cusparseDnMatDescr_t matCDescr; 1731 size_t spmmBufferSize; 1732 void *spmmBuffer; 1733 PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/ 1734 #endif 1735 }; 1736 1737 static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 1738 { 1739 PetscErrorCode ierr; 1740 MatMatCusparse *mmdata = (MatMatCusparse *)data; 1741 cudaError_t cerr; 1742 1743 PetscFunctionBegin; 1744 cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr); 1745 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1746 cusparseStatus_t stat; 1747 if (mmdata->matBDescr) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat);} 1748 if (mmdata->matCDescr) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat);} 1749 if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);} 1750 #endif 1751 ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr); 1752 ierr = PetscFree(data);CHKERRQ(ierr); 1753 PetscFunctionReturn(0); 1754 } 1755 1756 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool); 1757 1758 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 1759 { 1760 Mat_Product *product = C->product; 1761 Mat A,B; 1762 PetscInt m,n,blda,clda; 1763 PetscBool flg,biscuda; 1764 Mat_SeqAIJCUSPARSE *cusp; 1765 cusparseStatus_t stat; 1766 cusparseOperation_t opA; 1767 const PetscScalar *barray; 1768 PetscScalar *carray; 1769 PetscErrorCode ierr; 1770 MatMatCusparse *mmdata; 1771 Mat_SeqAIJCUSPARSEMultStruct *mat; 1772 CsrMatrix *csrmat; 1773 cudaError_t cerr; 1774 1775 PetscFunctionBegin; 1776 MatCheckProduct(C,1); 1777 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 1778 mmdata = (MatMatCusparse*)product->data; 1779 A = product->A; 1780 B = product->B; 1781 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 1782 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 1783 /* currently CopyToGpu does not copy if the matrix is bound to CPU 1784 Instead of silently accepting the wrong answer, I prefer to raise the error */ 1785 if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 1786 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1787 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1788 switch (product->type) { 1789 case MATPRODUCT_AB: 1790 case MATPRODUCT_PtAP: 1791 mat = cusp->mat; 1792 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1793 m = A->rmap->n; 1794 n = B->cmap->n; 1795 break; 1796 case MATPRODUCT_AtB: 1797 if (!cusp->transgen) { 1798 mat = cusp->mat; 1799 opA = CUSPARSE_OPERATION_TRANSPOSE; 1800 } else { 1801 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1802 mat = cusp->matTranspose; 1803 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1804 } 1805 m = A->cmap->n; 1806 n = B->cmap->n; 1807 break; 1808 case MATPRODUCT_ABt: 1809 case MATPRODUCT_RARt: 1810 mat = cusp->mat; 1811 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1812 m = A->rmap->n; 1813 n = B->rmap->n; 1814 break; 1815 default: 1816 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 1817 } 1818 if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 1819 csrmat = (CsrMatrix*)mat->mat; 1820 /* if the user passed a CPU matrix, copy the data to the GPU */ 1821 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr); 1822 if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);} 1823 ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr); 1824 1825 ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr); 1826 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 1827 ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 1828 ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr); 1829 } else { 1830 ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr); 1831 ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr); 1832 } 1833 1834 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1835 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1836 cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 1837 /* (re)allcoate spmmBuffer if not initialized or LDAs are different */ 1838 if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 1839 if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;} 1840 if (!mmdata->matBDescr) { 1841 stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 1842 mmdata->Blda = blda; 1843 } 1844 1845 if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;} 1846 if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 1847 stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 1848 mmdata->Clda = clda; 1849 } 1850 1851 if (!mat->matDescr) { 1852 stat = cusparseCreateCsr(&mat->matDescr, 1853 csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, 1854 csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), 1855 csrmat->values->data().get(), 1856 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 1857 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 1858 } 1859 stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one, 1860 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 1861 mmdata->matCDescr,cusparse_scalartype, 1862 cusp->spmmAlg,&mmdata->spmmBufferSize);CHKERRCUSPARSE(stat); 1863 if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);} 1864 cerr = cudaMalloc(&mmdata->spmmBuffer,mmdata->spmmBufferSize);CHKERRCUDA(cerr); 1865 mmdata->initialized = PETSC_TRUE; 1866 } else { 1867 /* to be safe, always update pointers of the mats */ 1868 stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat); 1869 stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat); 1870 stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat); 1871 } 1872 1873 /* do cusparseSpMM, which supports transpose on B */ 1874 stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one, 1875 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 1876 mmdata->matCDescr,cusparse_scalartype, 1877 cusp->spmmAlg,mmdata->spmmBuffer);CHKERRCUSPARSE(stat); 1878 #else 1879 PetscInt k; 1880 /* cusparseXcsrmm does not support transpose on B */ 1881 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 1882 cublasHandle_t cublasv2handle; 1883 cublasStatus_t cerr; 1884 1885 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 1886 cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T, 1887 B->cmap->n,B->rmap->n, 1888 &PETSC_CUSPARSE_ONE ,barray,blda, 1889 &PETSC_CUSPARSE_ZERO,barray,blda, 1890 mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr); 1891 blda = B->cmap->n; 1892 k = B->cmap->n; 1893 } else { 1894 k = B->rmap->n; 1895 } 1896 1897 /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 1898 stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k, 1899 csrmat->num_entries,mat->alpha_one,mat->descr, 1900 csrmat->values->data().get(), 1901 csrmat->row_offsets->data().get(), 1902 csrmat->column_indices->data().get(), 1903 mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero, 1904 carray,clda);CHKERRCUSPARSE(stat); 1905 #endif 1906 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1907 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1908 ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr); 1909 ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr); 1910 if (product->type == MATPRODUCT_RARt) { 1911 ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 1912 ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 1913 } else if (product->type == MATPRODUCT_PtAP) { 1914 ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 1915 ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 1916 } else { 1917 ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr); 1918 } 1919 if (mmdata->cisdense) { 1920 ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr); 1921 } 1922 if (!biscuda) { 1923 ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 1924 } 1925 PetscFunctionReturn(0); 1926 } 1927 1928 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 1929 { 1930 Mat_Product *product = C->product; 1931 Mat A,B; 1932 PetscInt m,n; 1933 PetscBool cisdense,flg; 1934 PetscErrorCode ierr; 1935 MatMatCusparse *mmdata; 1936 Mat_SeqAIJCUSPARSE *cusp; 1937 1938 PetscFunctionBegin; 1939 MatCheckProduct(C,1); 1940 if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty"); 1941 A = product->A; 1942 B = product->B; 1943 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 1944 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 1945 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1946 if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 1947 switch (product->type) { 1948 case MATPRODUCT_AB: 1949 m = A->rmap->n; 1950 n = B->cmap->n; 1951 break; 1952 case MATPRODUCT_AtB: 1953 m = A->cmap->n; 1954 n = B->cmap->n; 1955 break; 1956 case MATPRODUCT_ABt: 1957 m = A->rmap->n; 1958 n = B->rmap->n; 1959 break; 1960 case MATPRODUCT_PtAP: 1961 m = B->cmap->n; 1962 n = B->cmap->n; 1963 break; 1964 case MATPRODUCT_RARt: 1965 m = B->rmap->n; 1966 n = B->rmap->n; 1967 break; 1968 default: 1969 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 1970 } 1971 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 1972 /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */ 1973 ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr); 1974 ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr); 1975 1976 /* product data */ 1977 ierr = PetscNew(&mmdata);CHKERRQ(ierr); 1978 mmdata->cisdense = cisdense; 1979 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0) 1980 /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */ 1981 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 1982 cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr); 1983 } 1984 #endif 1985 /* for these products we need intermediate storage */ 1986 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 1987 ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr); 1988 ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr); 1989 if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */ 1990 ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr); 1991 } else { 1992 ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr); 1993 } 1994 } 1995 C->product->data = mmdata; 1996 C->product->destroy = MatDestroy_MatMatCusparse; 1997 1998 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 1999 PetscFunctionReturn(0); 2000 } 2001 2002 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 2003 2004 /* handles dense B */ 2005 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C) 2006 { 2007 Mat_Product *product = C->product; 2008 PetscErrorCode ierr; 2009 2010 PetscFunctionBegin; 2011 MatCheckProduct(C,1); 2012 if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A"); 2013 if (product->A->boundtocpu) { 2014 ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(C);CHKERRQ(ierr); 2015 PetscFunctionReturn(0); 2016 } 2017 switch (product->type) { 2018 case MATPRODUCT_AB: 2019 case MATPRODUCT_AtB: 2020 case MATPRODUCT_ABt: 2021 case MATPRODUCT_PtAP: 2022 case MATPRODUCT_RARt: 2023 C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 2024 default: 2025 break; 2026 } 2027 PetscFunctionReturn(0); 2028 } 2029 2030 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2031 { 2032 PetscErrorCode ierr; 2033 2034 PetscFunctionBegin; 2035 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2036 PetscFunctionReturn(0); 2037 } 2038 2039 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz) 2040 { 2041 PetscErrorCode ierr; 2042 2043 PetscFunctionBegin; 2044 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2045 PetscFunctionReturn(0); 2046 } 2047 2048 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2049 { 2050 PetscErrorCode ierr; 2051 2052 PetscFunctionBegin; 2053 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 2054 PetscFunctionReturn(0); 2055 } 2056 2057 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2058 { 2059 PetscErrorCode ierr; 2060 2061 PetscFunctionBegin; 2062 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 2063 PetscFunctionReturn(0); 2064 } 2065 2066 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2067 { 2068 PetscErrorCode ierr; 2069 2070 PetscFunctionBegin; 2071 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2072 PetscFunctionReturn(0); 2073 } 2074 2075 /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 2076 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm) 2077 { 2078 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2079 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 2080 Mat_SeqAIJCUSPARSEMultStruct *matstruct; 2081 PetscScalar *xarray,*zarray,*dptr,*beta,*xptr; 2082 PetscErrorCode ierr; 2083 cudaError_t cerr; 2084 cusparseStatus_t stat; 2085 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2086 PetscBool compressed; 2087 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2088 PetscInt nx,ny; 2089 #endif 2090 2091 PetscFunctionBegin; 2092 if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported"); 2093 if (!a->nonzerorowcnt) { 2094 if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);} 2095 PetscFunctionReturn(0); 2096 } 2097 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 2098 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2099 if (!trans) { 2100 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 2101 if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 2102 } else { 2103 if (herm || !cusparsestruct->transgen) { 2104 opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 2105 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 2106 } else { 2107 if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);} 2108 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 2109 } 2110 } 2111 /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 2112 compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE; 2113 2114 try { 2115 ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 2116 if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */ 2117 else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */ 2118 2119 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2120 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 2121 /* z = A x + beta y. 2122 If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 2123 When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 2124 */ 2125 xptr = xarray; 2126 dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 2127 beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 2128 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2129 /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 2130 allocated to accommodate different uses. So we get the length info directly from mat. 2131 */ 2132 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2133 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2134 nx = mat->num_cols; 2135 ny = mat->num_rows; 2136 } 2137 #endif 2138 } else { 2139 /* z = A^T x + beta y 2140 If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 2141 Note A^Tx is of full length, so we set beta to 1.0 if y exists. 2142 */ 2143 xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 2144 dptr = zarray; 2145 beta = yy ? matstruct->beta_one : matstruct->beta_zero; 2146 if (compressed) { /* Scatter x to work vector */ 2147 thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 2148 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 2149 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 2150 VecCUDAEqualsReverse()); 2151 } 2152 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2153 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2154 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2155 nx = mat->num_rows; 2156 ny = mat->num_cols; 2157 } 2158 #endif 2159 } 2160 2161 /* csr_spmv does y = alpha op(A) x + beta y */ 2162 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2163 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2164 if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 2165 if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 2166 stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 2167 stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 2168 stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, 2169 matstruct->matDescr, 2170 matstruct->cuSpMV[opA].vecXDescr, beta, 2171 matstruct->cuSpMV[opA].vecYDescr, 2172 cusparse_scalartype, 2173 cusparsestruct->spmvAlg, 2174 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat); 2175 cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr); 2176 2177 matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 2178 } else { 2179 /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 2180 stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat); 2181 stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat); 2182 } 2183 2184 stat = cusparseSpMV(cusparsestruct->handle, opA, 2185 matstruct->alpha_one, 2186 matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */ 2187 matstruct->cuSpMV[opA].vecXDescr, 2188 beta, 2189 matstruct->cuSpMV[opA].vecYDescr, 2190 cusparse_scalartype, 2191 cusparsestruct->spmvAlg, 2192 matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat); 2193 #else 2194 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2195 stat = cusparse_csr_spmv(cusparsestruct->handle, opA, 2196 mat->num_rows, mat->num_cols, 2197 mat->num_entries, matstruct->alpha_one, matstruct->descr, 2198 mat->values->data().get(), mat->row_offsets->data().get(), 2199 mat->column_indices->data().get(), xptr, beta, 2200 dptr);CHKERRCUSPARSE(stat); 2201 #endif 2202 } else { 2203 if (cusparsestruct->nrows) { 2204 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2205 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 2206 #else 2207 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 2208 stat = cusparse_hyb_spmv(cusparsestruct->handle, opA, 2209 matstruct->alpha_one, matstruct->descr, hybMat, 2210 xptr, beta, 2211 dptr);CHKERRCUSPARSE(stat); 2212 #endif 2213 } 2214 } 2215 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2216 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2217 2218 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 2219 if (yy) { /* MatMultAdd: zz = A*xx + yy */ 2220 if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 2221 ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */ 2222 } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */ 2223 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 2224 } 2225 } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 2226 ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr); 2227 } 2228 2229 /* ScatterAdd the result from work vector into the full vector when A is compressed */ 2230 if (compressed) { 2231 thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 2232 2233 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2234 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 2235 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 2236 VecCUDAPlusEquals()); 2237 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2238 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2239 } 2240 } else { 2241 if (yy && yy != zz) { 2242 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 2243 } 2244 } 2245 ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 2246 if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);} 2247 else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);} 2248 } catch(char *ex) { 2249 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 2250 } 2251 if (yy) { 2252 ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); 2253 } else { 2254 ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr); 2255 } 2256 PetscFunctionReturn(0); 2257 } 2258 2259 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2260 { 2261 PetscErrorCode ierr; 2262 2263 PetscFunctionBegin; 2264 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2265 PetscFunctionReturn(0); 2266 } 2267 2268 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 2269 { 2270 PetscErrorCode ierr; 2271 PetscSplitCSRDataStructure *d_mat = NULL, h_mat; 2272 PetscBool is_seq = PETSC_TRUE; 2273 PetscInt nnz_state = A->nonzerostate; 2274 PetscFunctionBegin; 2275 if (A->factortype == MAT_FACTOR_NONE) { 2276 d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat; 2277 } 2278 if (d_mat) { 2279 cudaError_t err; 2280 ierr = PetscInfo(A,"Assemble device matrix\n");CHKERRQ(ierr); 2281 err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err); 2282 nnz_state = h_mat.nonzerostate; 2283 is_seq = h_mat.seq; 2284 } 2285 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it? 2286 if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0); 2287 if (A->factortype == MAT_FACTOR_NONE && A->nonzerostate >= nnz_state && is_seq) { // assembled on CPU eventhough equiped for GPU 2288 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2289 } else if (nnz_state > A->nonzerostate) { 2290 A->offloadmask = PETSC_OFFLOAD_GPU; 2291 } 2292 2293 PetscFunctionReturn(0); 2294 } 2295 2296 /* --------------------------------------------------------------------------------*/ 2297 /*@ 2298 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 2299 (the default parallel PETSc format). This matrix will ultimately pushed down 2300 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 2301 assembly performance the user should preallocate the matrix storage by setting 2302 the parameter nz (or the array nnz). By setting these parameters accurately, 2303 performance during matrix assembly can be increased by more than a factor of 50. 2304 2305 Collective 2306 2307 Input Parameters: 2308 + comm - MPI communicator, set to PETSC_COMM_SELF 2309 . m - number of rows 2310 . n - number of columns 2311 . nz - number of nonzeros per row (same for all rows) 2312 - nnz - array containing the number of nonzeros in the various rows 2313 (possibly different for each row) or NULL 2314 2315 Output Parameter: 2316 . A - the matrix 2317 2318 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 2319 MatXXXXSetPreallocation() paradgm instead of this routine directly. 2320 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 2321 2322 Notes: 2323 If nnz is given then nz is ignored 2324 2325 The AIJ format (also called the Yale sparse matrix format or 2326 compressed row storage), is fully compatible with standard Fortran 77 2327 storage. That is, the stored row and column indices can begin at 2328 either one (as in Fortran) or zero. See the users' manual for details. 2329 2330 Specify the preallocated storage with either nz or nnz (not both). 2331 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 2332 allocation. For large problems you MUST preallocate memory or you 2333 will get TERRIBLE performance, see the users' manual chapter on matrices. 2334 2335 By default, this format uses inodes (identical nodes) when possible, to 2336 improve numerical efficiency of matrix-vector products and solves. We 2337 search for consecutive rows with the same nonzero structure, thereby 2338 reusing matrix information to achieve increased efficiency. 2339 2340 Level: intermediate 2341 2342 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 2343 @*/ 2344 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 2345 { 2346 PetscErrorCode ierr; 2347 2348 PetscFunctionBegin; 2349 ierr = MatCreate(comm,A);CHKERRQ(ierr); 2350 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 2351 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 2352 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 2353 PetscFunctionReturn(0); 2354 } 2355 2356 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 2357 { 2358 PetscErrorCode ierr; 2359 PetscSplitCSRDataStructure *d_mat = NULL; 2360 2361 PetscFunctionBegin; 2362 if (A->factortype == MAT_FACTOR_NONE) { 2363 d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat; 2364 ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL; 2365 ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr); 2366 } else { 2367 ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr); 2368 } 2369 if (d_mat) { 2370 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2371 cudaError_t err; 2372 PetscSplitCSRDataStructure h_mat; 2373 ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr); 2374 err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err); 2375 if (h_mat.seq) { 2376 if (a->compressedrow.use) { 2377 err = cudaFree(h_mat.diag.i);CHKERRCUDA(err); 2378 } 2379 err = cudaFree(d_mat);CHKERRCUDA(err); 2380 } 2381 } 2382 ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr); 2383 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 2384 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 2385 ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 2386 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 2387 PetscFunctionReturn(0); 2388 } 2389 2390 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*); 2391 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool); 2392 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B) 2393 { 2394 PetscErrorCode ierr; 2395 2396 PetscFunctionBegin; 2397 ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 2398 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr); 2399 PetscFunctionReturn(0); 2400 } 2401 2402 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg) 2403 { 2404 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2405 PetscErrorCode ierr; 2406 2407 PetscFunctionBegin; 2408 if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0); 2409 /* Currently, there is no case in which an AIJCUSPARSE matrix ever has its offloadmask set to PETS_OFFLOAD_GPU. 2410 If this changes, we need to implement a routine to update the CPU (host) version of the matrix from the GPU one. 2411 Right now, for safety we simply check for PETSC_OFFLOAD_GPU and have MatBindToCPU() error in this case. 2412 TODO: Add MatAIJCUSPARSECopyFromGPU() and make MatBindToCPU() functional for AIJCUSPARSE matries; 2413 can follow the example of MatBindToCPU_SeqAIJViennaCL(). */ 2414 if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"PETSC_OFFLOAD_GPU should not happen. Please report your use case to petsc-dev@mcs.anl.gov"); 2415 /* TODO: add support for this? */ 2416 if (flg) { 2417 A->ops->mult = MatMult_SeqAIJ; 2418 A->ops->multadd = MatMultAdd_SeqAIJ; 2419 A->ops->multtranspose = MatMultTranspose_SeqAIJ; 2420 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 2421 A->ops->multhermitiantranspose = NULL; 2422 A->ops->multhermitiantransposeadd = NULL; 2423 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 2424 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 2425 } else { 2426 A->ops->mult = MatMult_SeqAIJCUSPARSE; 2427 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 2428 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 2429 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 2430 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 2431 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 2432 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 2433 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 2434 } 2435 A->boundtocpu = flg; 2436 a->inode.use = flg; 2437 PetscFunctionReturn(0); 2438 } 2439 2440 static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) 2441 { 2442 PetscSplitCSRDataStructure *d_mat = NULL; 2443 PetscErrorCode ierr; 2444 PetscFunctionBegin; 2445 if (A->factortype == MAT_FACTOR_NONE) { 2446 Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr; 2447 d_mat = spptr->deviceMat; 2448 } 2449 if (d_mat) { 2450 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2451 PetscInt n = A->rmap->n, nnz = a->i[n]; 2452 cudaError_t err; 2453 PetscScalar *vals; 2454 ierr = PetscInfo(A,"Zero device matrix\n");CHKERRQ(ierr); 2455 err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err); 2456 err = cudaMemset( vals, 0, (nnz)*sizeof(PetscScalar));CHKERRCUDA(err); 2457 } 2458 ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 2459 2460 PetscFunctionReturn(0); 2461 } 2462 2463 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 2464 { 2465 PetscErrorCode ierr; 2466 cusparseStatus_t stat; 2467 Mat B; 2468 2469 PetscFunctionBegin; 2470 if (reuse == MAT_INITIAL_MATRIX) { 2471 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 2472 } else if (reuse == MAT_REUSE_MATRIX) { 2473 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 2474 } 2475 B = *newmat; 2476 2477 ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 2478 ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr); 2479 2480 if (reuse != MAT_REUSE_MATRIX && !B->spptr) { 2481 if (B->factortype == MAT_FACTOR_NONE) { 2482 Mat_SeqAIJCUSPARSE *spptr; 2483 2484 ierr = PetscNew(&spptr);CHKERRQ(ierr); 2485 spptr->format = MAT_CUSPARSE_CSR; 2486 stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 2487 B->spptr = spptr; 2488 spptr->deviceMat = NULL; 2489 } else { 2490 Mat_SeqAIJCUSPARSETriFactors *spptr; 2491 2492 ierr = PetscNew(&spptr);CHKERRQ(ierr); 2493 stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 2494 B->spptr = spptr; 2495 } 2496 B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 2497 } 2498 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 2499 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 2500 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 2501 B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 2502 B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 2503 B->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE; 2504 2505 ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr); 2506 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 2507 ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 2508 PetscFunctionReturn(0); 2509 } 2510 2511 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 2512 { 2513 PetscErrorCode ierr; 2514 2515 PetscFunctionBegin; 2516 ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); 2517 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 2518 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 2519 ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr); 2520 ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr); 2521 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2522 PetscFunctionReturn(0); 2523 } 2524 2525 /*MC 2526 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 2527 2528 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 2529 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 2530 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 2531 2532 Options Database Keys: 2533 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 2534 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 2535 - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 2536 2537 Level: beginner 2538 2539 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 2540 M*/ 2541 2542 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*); 2543 2544 2545 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 2546 { 2547 PetscErrorCode ierr; 2548 2549 PetscFunctionBegin; 2550 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 2551 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 2552 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 2553 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 2554 PetscFunctionReturn(0); 2555 } 2556 2557 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 2558 { 2559 PetscErrorCode ierr; 2560 cusparseStatus_t stat; 2561 cusparseHandle_t handle; 2562 2563 PetscFunctionBegin; 2564 if (*cusparsestruct) { 2565 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr); 2566 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr); 2567 delete (*cusparsestruct)->workVector; 2568 delete (*cusparsestruct)->rowoffsets_gpu; 2569 if (handle = (*cusparsestruct)->handle) {stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);} 2570 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2571 cudaError_t cerr = cudaFree((*cusparsestruct)->csr2cscBuffer);CHKERRCUDA(cerr); 2572 #endif 2573 ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr); 2574 } 2575 PetscFunctionReturn(0); 2576 } 2577 2578 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 2579 { 2580 PetscFunctionBegin; 2581 if (*mat) { 2582 delete (*mat)->values; 2583 delete (*mat)->column_indices; 2584 delete (*mat)->row_offsets; 2585 delete *mat; 2586 *mat = 0; 2587 } 2588 PetscFunctionReturn(0); 2589 } 2590 2591 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 2592 { 2593 cusparseStatus_t stat; 2594 PetscErrorCode ierr; 2595 2596 PetscFunctionBegin; 2597 if (*trifactor) { 2598 if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); } 2599 if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); } 2600 ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 2601 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2602 cudaError_t cerr; 2603 if ((*trifactor)->solveBuffer) {cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);} 2604 if ((*trifactor)->csr2cscBuffer) {cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);} 2605 #endif 2606 delete *trifactor; 2607 *trifactor = 0; 2608 } 2609 PetscFunctionReturn(0); 2610 } 2611 2612 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 2613 { 2614 CsrMatrix *mat; 2615 cusparseStatus_t stat; 2616 cudaError_t err; 2617 2618 PetscFunctionBegin; 2619 if (*matstruct) { 2620 if ((*matstruct)->mat) { 2621 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 2622 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2623 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 2624 #else 2625 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 2626 stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat); 2627 #endif 2628 } else { 2629 mat = (CsrMatrix*)(*matstruct)->mat; 2630 CsrMatrix_Destroy(&mat); 2631 } 2632 } 2633 if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); } 2634 delete (*matstruct)->cprowIndices; 2635 if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); } 2636 if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); } 2637 if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); } 2638 2639 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2640 Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 2641 if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);} 2642 for (int i=0; i<3; i++) { 2643 if (mdata->cuSpMV[i].initialized) { 2644 err = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err); 2645 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat); 2646 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat); 2647 } 2648 } 2649 #endif 2650 delete *matstruct; 2651 *matstruct = 0; 2652 } 2653 PetscFunctionReturn(0); 2654 } 2655 2656 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors) 2657 { 2658 PetscErrorCode ierr; 2659 2660 PetscFunctionBegin; 2661 if (*trifactors) { 2662 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr); 2663 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr); 2664 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr); 2665 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr); 2666 delete (*trifactors)->rpermIndices; 2667 delete (*trifactors)->cpermIndices; 2668 delete (*trifactors)->workVector; 2669 (*trifactors)->rpermIndices = 0; 2670 (*trifactors)->cpermIndices = 0; 2671 (*trifactors)->workVector = 0; 2672 } 2673 PetscFunctionReturn(0); 2674 } 2675 2676 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 2677 { 2678 PetscErrorCode ierr; 2679 cusparseHandle_t handle; 2680 cusparseStatus_t stat; 2681 2682 PetscFunctionBegin; 2683 if (*trifactors) { 2684 ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr); 2685 if (handle = (*trifactors)->handle) { 2686 stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 2687 } 2688 ierr = PetscFree(*trifactors);CHKERRQ(ierr); 2689 } 2690 PetscFunctionReturn(0); 2691 } 2692