1 /* 2 Defines the basic matrix operations for the AIJ (compressed row) 3 matrix storage format using the CUSPARSE library, 4 */ 5 #define PETSC_SKIP_SPINLOCK 6 #define PETSC_SKIP_CXX_COMPLEX_FIX 7 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 8 9 #include <petscconf.h> 10 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 11 #include <../src/mat/impls/sbaij/seq/sbaij.h> 12 #include <../src/vec/vec/impls/dvecimpl.h> 13 #include <petsc/private/vecimpl.h> 14 #undef VecType 15 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 16 17 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 18 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 19 /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in 20 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them. 21 22 typedef enum { 23 CUSPARSE_MV_ALG_DEFAULT = 0, 24 CUSPARSE_COOMV_ALG = 1, 25 CUSPARSE_CSRMV_ALG1 = 2, 26 CUSPARSE_CSRMV_ALG2 = 3 27 } cusparseSpMVAlg_t; 28 29 typedef enum { 30 CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0, 31 CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1, 32 CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2, 33 CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3, 34 CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4, 35 CUSPARSE_SPMM_ALG_DEFAULT = 0, 36 CUSPARSE_SPMM_COO_ALG1 = 1, 37 CUSPARSE_SPMM_COO_ALG2 = 2, 38 CUSPARSE_SPMM_COO_ALG3 = 3, 39 CUSPARSE_SPMM_COO_ALG4 = 5, 40 CUSPARSE_SPMM_CSR_ALG1 = 4, 41 CUSPARSE_SPMM_CSR_ALG2 = 6, 42 } cusparseSpMMAlg_t; 43 44 typedef enum { 45 CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc 46 CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministc 47 } cusparseCsr2CscAlg_t; 48 */ 49 const char *const 
MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0}; 50 const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0}; 51 const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0}; 52 #endif 53 54 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 55 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 56 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 57 58 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 59 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 60 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 61 62 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 63 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 64 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 65 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 66 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat); 67 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 68 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 69 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 70 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 71 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 72 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 73 static PetscErrorCode 
MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

/* Attach a user-provided CUDA stream to the cuSPARSE handle held by A.
   Subsequent cuSPARSE calls made through this matrix are issued on that stream. */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   cstat;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusp->stream = stream;
  cstat = cusparseSetStream(cusp->handle,cusp->stream);CHKERRCUSPARSE(cstat);
  PetscFunctionReturn(0);
}

/* Adopt a caller-provided cuSPARSE handle on A. Any handle previously owned by
   the matrix is destroyed first; pointer mode is (re)set to DEVICE on the
   handle that ends up installed. */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   cstat;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusp->handle != handle) {
    /* release the old handle before adopting the caller's */
    if (cusp->handle) {
      cstat = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(cstat);
    }
    cusp->handle = handle;
  }
  cstat = cusparseSetPointerMode(cusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(cstat);
  PetscFunctionReturn(0);
}

/* Drop the cuSPARSE handle reference held by A without destroying it; the
   caller retains ownership of the handle itself. */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusp->handle) cusp->handle = 0;
  PetscFunctionReturn(0);
}

/* Report the solver package name for matrices factored by this package. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE =
"cusparse" - A matrix type providing triangular solvers for seq matrices 127 on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported 128 algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 129 performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 130 CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 131 algorithms are not recommended. This class does NOT support direct solver operations. 132 133 Level: beginner 134 135 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 136 M*/ 137 138 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B) 139 { 140 PetscErrorCode ierr; 141 PetscInt n = A->rmap->n; 142 143 PetscFunctionBegin; 144 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 145 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 146 (*B)->factortype = ftype; 147 (*B)->useordering = PETSC_TRUE; 148 ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 149 150 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 151 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 152 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 153 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 154 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 155 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 156 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 157 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 158 159 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 160 ierr = 
PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Record the requested storage format on the Mat_SeqAIJCUSPARSE structure.
   For sequential matrices MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL select the
   same (single) on-GPU matrix, so both arms store into the same field. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:   /* fall through: same effect as MAT_CUSPARSE_ALL here */
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.
   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB.
The latter two require CUDA 4.2)

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation if the matrix provides one */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
   MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose

   Collective on mat

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
-  transgen - the boolean flag

   Level: intermediate

.seealso: MATSEQAIJCUSPARSE
@*/
PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
{
  PetscErrorCode ierr;
  PetscBool      flg;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (flg) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
    cusp->transgen = transgen;
    if (!transgen) { /* need to destroy the transpose matrix if present to prevent from logic errors if transgen is set to true later */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}

/* Process runtime options for SEQAIJCUSPARSE matrices: explicit-transpose
   generation, GPU storage formats, and (CUDA >= 11) the cuSPARSE algorithm
   choices for SpMV, SpMM and CSR-to-CSC conversion. Factored matrices take
   no options here. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscBool transgen = cusparsestruct->transgen;

    ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);}

    /* format for SpMV only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    /* format for both SpMV and triangular solves */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Symbolic ILU: reuse the CPU (SeqAIJ) symbolic phase, then install the
   CUSPARSE numeric-factorization routine on B. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic LU: reuse the CPU (SeqAIJ) symbolic phase, then install the
   CUSPARSE numeric-factorization routine on B. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic ICC: reuse the CPU (SeqAIJ) symbolic phase, then install the
   CUSPARSE numeric-factorization routine on B. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  /* numeric phase is handled by the CUSPARSE implementation */
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic Cholesky factorization: reuse the CPU (SeqAIJ) symbolic phase, then
   install the CUSPARSE numeric-factorization routine on B. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Build the unit-diagonal lower-triangular ILU factor of A as a 0-based CSR
   matrix on the GPU and run the cuSPARSE triangular-solve analysis on it.
   Host-side staging arrays are pinned (cudaMallocHost) and freed after the
   device copies. Only rebuilds when the data is not already on the GPU. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal.
*/ 346 nzLower=n+ai[n]-ai[1]; 347 348 /* Allocate Space for the lower triangular matrix */ 349 cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 350 cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr); 351 cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr); 352 353 /* Fill the lower triangular matrix */ 354 AiLo[0] = (PetscInt) 0; 355 AiLo[n] = nzLower; 356 AjLo[0] = (PetscInt) 0; 357 AALo[0] = (MatScalar) 1.0; 358 v = aa; 359 vi = aj; 360 offset = 1; 361 rowOffset= 1; 362 for (i=1; i<n; i++) { 363 nz = ai[i+1] - ai[i]; 364 /* additional 1 for the term on the diagonal */ 365 AiLo[i] = rowOffset; 366 rowOffset += nz+1; 367 368 ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr); 369 ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr); 370 371 offset += nz; 372 AjLo[offset] = (PetscInt) i; 373 AALo[offset] = (MatScalar) 1.0; 374 offset += 1; 375 376 v += nz; 377 vi += nz; 378 } 379 380 /* allocate space for the triangular factor information */ 381 loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 382 383 /* Create the matrix description */ 384 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 385 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 386 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 387 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 388 #else 389 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 390 #endif 391 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat); 392 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 393 394 /* set the operation */ 395 loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 396 397 /* set the matrix */ 398 loTriFactor->csrMat = new CsrMatrix; 399 
loTriFactor->csrMat->num_rows = n; 400 loTriFactor->csrMat->num_cols = n; 401 loTriFactor->csrMat->num_entries = nzLower; 402 403 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 404 loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1); 405 406 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 407 loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower); 408 409 loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 410 loTriFactor->csrMat->values->assign(AALo, AALo+nzLower); 411 412 /* Create the solve analysis information */ 413 stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 414 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 415 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 416 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 417 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 418 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 419 &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 420 cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 421 #endif 422 423 /* perform the solve analysis */ 424 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 425 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 426 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 427 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo 428 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 429 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 430 #endif 431 );CHKERRCUSPARSE(stat); 432 433 /* assign the pointer. Is this really necessary? 
*/ 434 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 435 436 cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr); 437 cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr); 438 cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr); 439 ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 440 } catch(char *ex) { 441 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 442 } 443 } 444 PetscFunctionReturn(0); 445 } 446 447 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 448 { 449 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 450 PetscInt n = A->rmap->n; 451 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 452 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 453 cusparseStatus_t stat; 454 const PetscInt *aj = a->j,*adiag = a->diag,*vi; 455 const MatScalar *aa = a->a,*v; 456 PetscInt *AiUp, *AjUp; 457 PetscScalar *AAUp; 458 PetscInt i,nz, nzUpper, offset; 459 PetscErrorCode ierr; 460 cudaError_t cerr; 461 462 PetscFunctionBegin; 463 if (!n) PetscFunctionReturn(0); 464 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 465 try { 466 /* next, figure out the number of nonzeros in the upper triangular matrix. 
*/ 467 nzUpper = adiag[0]-adiag[n]; 468 469 /* Allocate Space for the upper triangular matrix */ 470 cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 471 cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 472 cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 473 474 /* Fill the upper triangular matrix */ 475 AiUp[0]=(PetscInt) 0; 476 AiUp[n]=nzUpper; 477 offset = nzUpper; 478 for (i=n-1; i>=0; i--) { 479 v = aa + adiag[i+1] + 1; 480 vi = aj + adiag[i+1] + 1; 481 482 /* number of elements NOT on the diagonal */ 483 nz = adiag[i] - adiag[i+1]-1; 484 485 /* decrement the offset */ 486 offset -= (nz+1); 487 488 /* first, set the diagonal elements */ 489 AjUp[offset] = (PetscInt) i; 490 AAUp[offset] = (MatScalar)1./v[nz]; 491 AiUp[i] = AiUp[i+1] - (nz+1); 492 493 ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr); 494 ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr); 495 } 496 497 /* allocate space for the triangular factor information */ 498 upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 499 500 /* Create the matrix description */ 501 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 502 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 503 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 504 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 505 #else 506 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 507 #endif 508 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 509 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 510 511 /* set the operation */ 512 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 513 514 /* set the matrix */ 515 upTriFactor->csrMat = new CsrMatrix; 516 
upTriFactor->csrMat->num_rows = n; 517 upTriFactor->csrMat->num_cols = n; 518 upTriFactor->csrMat->num_entries = nzUpper; 519 520 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 521 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 522 523 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 524 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 525 526 upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 527 upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 528 529 /* Create the solve analysis information */ 530 stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 531 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 532 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 533 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 534 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 535 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 536 &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 537 cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 538 #endif 539 540 /* perform the solve analysis */ 541 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 542 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 543 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 544 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo 545 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 546 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 547 #endif 548 );CHKERRCUSPARSE(stat); 549 550 /* assign the pointer. Is this really necessary? 
*/ 551 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 552 553 cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 554 cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 555 cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr); 556 ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 557 } catch(char *ex) { 558 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 559 } 560 } 561 PetscFunctionReturn(0); 562 } 563 564 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 565 { 566 PetscErrorCode ierr; 567 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 568 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 569 IS isrow = a->row,iscol = a->icol; 570 PetscBool row_identity,col_identity; 571 const PetscInt *r,*c; 572 PetscInt n = A->rmap->n; 573 574 PetscFunctionBegin; 575 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 576 ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr); 577 ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr); 578 579 cusparseTriFactors->workVector = new THRUSTARRAY(n); 580 cusparseTriFactors->nnz=a->nz; 581 582 A->offloadmask = PETSC_OFFLOAD_BOTH; 583 /* lower triangular indices */ 584 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 585 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 586 if (!row_identity) { 587 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 588 cusparseTriFactors->rpermIndices->assign(r, r+n); 589 } 590 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 591 592 /* upper triangular indices */ 593 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 594 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 595 if (!col_identity) { 596 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 597 cusparseTriFactors->cpermIndices->assign(c, c+n); 598 } 599 600 if (!row_identity && !col_identity) { 601 ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr); 602 } else 
if(!row_identity) { 603 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 604 } else if(!col_identity) { 605 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 606 } 607 608 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 609 PetscFunctionReturn(0); 610 } 611 612 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 613 { 614 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 615 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 616 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 617 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 618 cusparseStatus_t stat; 619 PetscErrorCode ierr; 620 cudaError_t cerr; 621 PetscInt *AiUp, *AjUp; 622 PetscScalar *AAUp; 623 PetscScalar *AALo; 624 PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 625 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 626 const PetscInt *ai = b->i,*aj = b->j,*vj; 627 const MatScalar *aa = b->a,*v; 628 629 PetscFunctionBegin; 630 if (!n) PetscFunctionReturn(0); 631 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 632 try { 633 /* Allocate Space for the upper triangular matrix */ 634 cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 635 cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 636 cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 637 cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 638 639 /* Fill the upper triangular matrix */ 640 AiUp[0]=(PetscInt) 0; 641 AiUp[n]=nzUpper; 642 offset = 0; 643 for (i=0; i<n; i++) { 644 /* set the pointers */ 645 v = aa + ai[i]; 646 vj = aj + ai[i]; 647 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 648 649 /* first, set the diagonal elements */ 650 AjUp[offset] = (PetscInt) i; 651 
        /* row i of the factor: the (inverted) diagonal entry goes first; AiUp
           records where row i starts in the packed arrays */
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          /* copy the strictly-upper column indices/values of row i, then negate
             them for U and scale by the diagonal for the companion L data */
          ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* CUDA 11 dropped the TRIANGULAR matrix type; GENERAL plus fill/diag mode is used instead */
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* set the matrix: the U factor in CSR form, copied to the GPU via thrust assign */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* Create the solve analysis information */
      stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* CUDA 11 csrsv2 requires an explicit caller-provided work buffer */
      stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                     upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                     upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                     upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                     &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
      cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                               ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                               );CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
      /* the L factor reuses the upper-triangular sparsity (AiUp/AjUp) and is applied
         with CUSPARSE_OPERATION_TRANSPOSE below -- hence fill mode UPPER here */
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix: same pattern as U, values from AALo */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
      /* log the host-to-device traffic for both factors (2x row offsets, indices, values) */
      ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);

      /* Create the solve analysis information */
      stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                     loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                     loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                     loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                     &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
      cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                               ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                               );CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* both host (CPU factor) and device copies are now valid; free the pinned staging arrays */
      A->offloadmask = PETSC_OFFLOAD_BOTH;
      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Reset any existing GPU factors, rebuild the ICC triangular factors on the GPU, and
   cache the row permutation and its inverse there when the ordering is not the identity.
   Also allocates the work vector used by the triangular solves and records the factor nnz
   count used for flop logging. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  /* nnz of L plus U: off-diagonals are stored twice, the diagonal once */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /* lower triangular indices */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip;

    /* cache the permutation (rperm) and its inverse (cperm) on the GPU for the solves */
    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Numeric LU factorization: factor on the CPU with the SeqAIJ kernel, then copy the
   triangular factors to the GPU and pick the solve variants based on the orderings. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const
                                                       MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* the numeric factorization itself runs on the CPU; the factors are pushed to the GPU below */
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    /* natural ordering: the solves can skip the permutation gather/scatter */
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Numeric Cholesky factorization: factor on the CPU with the SeqAIJ kernel, then copy
   the ICC triangular factors to the GPU and pick the solve variants by the ordering. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used.
 */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Build the explicit transposes (CSC form) of the cached lower/upper triangular factors
   and run the cusparse solve analysis on them, so MatSolveTranspose does not have to
   transpose on every solve. The results are stored in loTriFactorPtrTranspose and
   upTriFactorPtrTranspose of the Mat_SeqAIJCUSPARSETriFactors attached to A->spptr. */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  /* NOTE(review): these two initial reads are immediately overwritten by the `new`
     allocations below, so an existing transpose is never reused here -- callers
     guard against rebuilding (see MatSolveTranspose_SeqAIJCUSPARSE) */
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  /* transposing flips the fill mode; index base, type and diag type carry over */
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation: the transpose is stored explicitly, so solves are NON_TRANSPOSE */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* CUDA 11 csr2csc (Csr2cscEx2) needs an explicit work buffer */
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                       loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                       loTriFactor->csrMat->values->data().get(),
                                       loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(),
                                       loTriFactorT->csrMat->values->data().get(),
                                       loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cudaError_t cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
#else
                          /* pre-CUDA-11 API takes (colInd, rowPtr) in this order */
                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
#endif
                          );CHKERRCUSPARSE(stat);

  /* Create the solve analysis information */
  stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                           loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                           ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
#endif
                           );CHKERRCUSPARSE(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                       upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                       upTriFactor->csrMat->values->data().get(),
                                       upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(),
                                       upTriFactorT->csrMat->values->data().get(),
                                       upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
#else
                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
#endif
                          );CHKERRCUSPARSE(stat);

  /* Create the solve analysis information */
  stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                           upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                           ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
#endif
                           );CHKERRCUSPARSE(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}

/* Build (once) the explicit transpose of the cusparse matrix used by MatMultTranspose.
   A no-op when transpose generation is disabled or the transpose already exists. */
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (!cusparsestruct->transgen || cusparsestruct->matTranspose) PetscFunctionReturn(0);
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* create cusparse matrix */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat =
cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

  /* set alpha and beta: device-resident scalars used by the SpMV calls (pointer mode DEVICE) */
  err = cudaMalloc((void **)&(matstructT->alpha_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    /* transpose dimensions: rows and columns swap relative to A */
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* full (non-compressed) row offsets of A, taken from the host CSR and cached on the GPU */
    cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
    cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

    /* compute the transpose, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                         A->cmap->n, matrix->num_entries,
                                         matrix->values->data().get(),
                                         cusparsestruct->rowoffsets_gpu->data().get(),
                                         matrix->column_indices->data().get(),
                                         matrixT->values->data().get(),
                                         matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                         CUSPARSE_ACTION_NUMERIC,indexBase,
                                         cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat);
    err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err);
#endif

    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            cusparsestruct->rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                            CUSPARSE_ACTION_NUMERIC,indexBase,
                            cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
#else
                            matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase
#endif
                            );CHKERRCUSPARSE(stat);
    matstructT->mat = matrixT;

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* generic SpMat descriptor for the CUDA-11 SpMV/SpMM API */
    stat = cusparseCreateCsr(&matstructT->matDescr,
                             matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                             matrixT->values->data().get(),
                             CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                             indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
    CsrMatrix *temp  = new CsrMatrix;
    CsrMatrix *tempT = new CsrMatrix;
    /* First convert HYB to CSR */
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSPARSE(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    /* NOTE(review): a true transpose would have num_rows = A->cmap->n and
       num_cols = A->rmap->n; as written this is only consistent for square A -- confirm */
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSPARSE(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
#endif
  }
  err  = WaitForCUDA();CHKERRCUDA(err);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}

/* Why do we need to analyze the tranposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE?
 */
/* Solve A^T x = b using the cached transposed triangular factors (built on first use).
   The right-hand side is gathered with the row permutation, the U then L solves are run
   (the reverse of the forward-solve order), and the result is scattered with the column
   permutation. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: x <- b(rperm) */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U: temp <- U^{-1} x */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        upTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get()
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Then, solve L: x <- L^{-1} temp */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        loTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Natural-ordering variant of MatSolveTranspose: identical solves, but no
   permutation gather/scatter is needed. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U: temp <- U^{-1} b */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        upTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get()
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Then, solve L: x <- L^{-1} temp */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        loTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Solve A x = b with the cached triangular factors: gather b with the row permutation,
   run the L then U solves, and scatter the result with the column permutation. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: temp <- b(rperm)
     (iterator distance is taken from the index iterators, so both permutation
     iterators may share the same element base bGPU) */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L: x <- L^{-1} temp */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        loTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Then, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        upTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get()
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Solve A x = b on the GPU with the natural ordering: no permutations are
   needed, so b feeds the L-solve directly and the U-solve writes x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        loTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get()
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Next, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        upTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz -
  A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Copy the host AIJ matrix to the GPU if the device copy is stale.
   Fast path: when the nonzero pattern is unchanged (same nonzerostate) and the
   format is CSR, only the numerical values are transferred, and a cached
   transpose (if built) is refreshed with csr2csc. Otherwise the cuSPARSE
   structures are destroyed and rebuilt from the host CSR arrays (optionally in
   compressed-row form, or converted to ELL/HYB before CUDA 11). */
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) PetscFunctionReturn(0); /* matrix pinned to the CPU: nothing to do */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* Copy values only */
      CsrMatrix *matrix,*matrixT;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      matrix->values->assign(a->a, a->a+a->nz);
      err = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);

      /* Update matT when it was built before */
      if (cusparsestruct->matTranspose) {
        cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
        matrixT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
        ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                                A->cmap->n, matrix->num_entries,
                                matrix->values->data().get(),
                                cusparsestruct->rowoffsets_gpu->data().get(),
                                matrix->column_indices->data().get(),
                                matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                CUSPARSE_ACTION_NUMERIC,indexBase,
                                cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
#else
                                matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                                CUSPARSE_ACTION_NUMERIC, indexBase
#endif
                                );CHKERRCUSPARSE(stat);
        err = WaitForCUDA();CHKERRCUDA(err);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
        ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
      }
    } else {
      /* Full rebuild: the nonzero pattern changed (or this is the first copy) */
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      try {
        /* Compressed-row storage drops empty rows; ridx maps compressed rows to full rows */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        cusparsestruct->nrows = m;

        /* create cusparse matrix */
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* Device-resident scalars (alpha=1, beta=0, beta=1) because the handle uses device pointer mode below */
        err = cudaMalloc((void **)&(matstruct->alpha_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = a->nz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(a->nz);
          mat->column_indices->assign(a->j, a->j+a->nz);

          mat->values = new THRUSTARRAY(a->nz);
          mat->values->assign(a->a, a->a+a->nz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                     mat->num_rows, mat->num_cols, mat->num_entries,
                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                     mat->values->data().get(),
                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* Build a temporary CSR copy, convert it to HYB, then free the CSR scratch */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = a->nz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(a->nz);
          mat->column_indices->assign(a->j, a->j+a->nz);

          mat->values = new THRUSTARRAY(a->nz);
          mat->values->assign(a->a, a->a+a->nz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
                                  matstruct->descr, mat->values->data().get(),
                                  mat->row_offsets->data().get(),
                                  mat->column_indices->data().get(),
                                  hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

/* Thrust functor: accumulate the first tuple element into the second, i.e. t<1> += t<0>. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/* Thrust functor: assign the second tuple element to the first, i.e. t<0> = t<1>. */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};

/* Cached data for MatProduct of a SeqAIJCUSPARSE matrix with a dense matrix. */
struct MatMatCusparse {
  PetscBool   cisdense; /* C was MATSEQDENSE (CPU); convert the result back after the GPU product */
  PetscScalar *Bt;      /* buffer for B^T on the pre-CUDA-11 path, where csrmm cannot transpose B */
  Mat         X;        /* intermediate dense product used by PtAP/RARt */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscBool initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;
  cusparseDnMatDescr_t matCDescr;
  size_t spmmBufferSize;
  void *spmmBuffer;
  PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
#endif
};

/* Destroy callback for the product data: frees GPU buffers, descriptors, and the intermediate matrix. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  PetscErrorCode ierr;
  MatMatCusparse *mmdata = (MatMatCusparse *)data;
  cudaError_t    cerr;

  PetscFunctionBegin;
  cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
  if (mmdata->matBDescr)  {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat);}
  if (mmdata->matCDescr)  {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat);}
  if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);}
#endif
  ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);

/* Numeric phase of C = op(A) op(B) for A sparse (SeqAIJCUSPARSE) and B dense.
   Uses cusparseSpMM on CUDA >= 11 (which can transpose B) or cusparseXcsrmm
   otherwise (B transposed explicitly with cublasXgeam). For PtAP/RARt the
   sparse product goes into the intermediate X, then a dense-dense product
   forms the final C. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* Select the sparse operand, its operation, and the result dimensions m x n */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    /* use either an implicit transpose or the explicitly generated transposed copy */
    if (!cusp->transgen) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* PtAP/RARt write the sparse product into the intermediate X, not C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allcoate spmmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmdata->spmmBufferSize);CHKERRCUSPARSE(stat);
    if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);}
    cerr = cudaMalloc(&mmdata->spmmBuffer,mmdata->spmmBufferSize);CHKERRCUDA(cerr);
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->spmmBuffer);CHKERRCUSPARSE(stat);
#else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
#endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  /* For RARt/PtAP finish with a dense-dense product: C = B * X (optionally with B transposed) */
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* restore the original (CPU) types of C and B when the caller passed CPU matrices */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Symbolic phase for sparse(A) * dense(B) products: validates the operands,
   sets the sizes and (CUDA) type of C, and allocates the MatMatCusparse
   product data, including the B^T buffer (pre CUDA-11) and the intermediate
   X needed by PtAP/RARt. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  /* result dimensions m x n per product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense;
#if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/* handles dense B */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product    *product = C->product;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A");
  /* defer to the CPU implementation when A is bound to the CPU */
  if (product->A->boundtocpu) {
    ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(C);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_AtB:
  case MATPRODUCT_ABt:
  case MATPRODUCT_PtAP:
  case MATPRODUCT_RARt:
    C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
    /* fall through: nothing else to do for supported types */
  default:
    break;
  }
  PetscFunctionReturn(0);
}

/* yy = A xx */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* zz = A xx + yy */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* yy = A^H xx */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr =
  MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* zz = A^H xx + yy */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* yy = A^T xx */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny; /* lengths of x and y for y = op(A) x */
#endif

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
  /* empty matrix: op(A) x == 0, so z = y (nothing to do) or z = 0 */
  if (!a->nonzerorowcnt) {
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  } else {
    if (herm || !cusparsestruct->transgen) {
      /* transpose handled by cuSPARSE operation on the original matrix */
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      /* transpose handled by multiplying with an explicitly generated transposed copy */
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
         allocated to accommodate different uses. So we get the length info directly from mat.
      */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
      */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA indexes the per-operation cuSpMV cache below, so it must stay in [0,2] */
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                       matstruct->matDescr,
                                       matstruct->cuSpMV[opA].vecXDescr, beta,
                                       matstruct->cuSpMV[opA].vecYDescr,
                                       cusparse_scalartype,
                                       cusparsestruct->spmvAlg,
                                       &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                          matstruct->alpha_one,
                          matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */
                          matstruct->cuSpMV[opA].vecXDescr,
                          beta,
                          matstruct->cuSpMV[opA].vecYDescr,
                          cusparse_scalartype,
                          cusparsestruct->spmvAlg,
                          matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
#else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
#endif
      }
    }
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed.
We first copy yy to zz, then ScatterAdd the work vector to zz */ 2223 ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */ 2224 } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */ 2225 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 2226 } 2227 } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 2228 ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr); 2229 } 2230 2231 /* ScatterAdd the result from work vector into the full vector when A is compressed */ 2232 if (compressed) { 2233 thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 2234 2235 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2236 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 2237 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 2238 VecCUDAPlusEquals()); 2239 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2240 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2241 } 2242 } else { 2243 if (yy && yy != zz) { 2244 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 2245 } 2246 } 2247 ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 2248 if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);} 2249 else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);} 2250 } catch(char *ex) { 2251 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 2252 } 2253 if (yy) { 2254 ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); 2255 } else { 2256 ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr); 2257 } 2258 PetscFunctionReturn(0); 2259 } 2260 2261 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec 
xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* zz = A^T xx + yy */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  /* Push only a final assembly to the GPU, and only when the matrix is not bound to the CPU */
  if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* A factored matrix stores triangular-factor data in spptr instead of the plain CUSPARSE struct */
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  /* Remove composed methods before handing destruction to the base SeqAIJ class */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Duplicate on the host, then convert the copy in place to the CUSPARSE type */
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  /* Currently, there is no case in which an AIJCUSPARSE matrix ever has its offloadmask set to PETSC_OFFLOAD_GPU.
     If this changes, we need to implement a routine to update the CPU (host) version of the matrix from the GPU one.
     Right now, for safety we simply check for PETSC_OFFLOAD_GPU and have MatBindToCPU() error in this case.
     TODO: Add MatAIJCUSPARSECopyFromGPU() and make MatBindToCPU() functional for AIJCUSPARSE matrices;
     can follow the example of MatBindToCPU_SeqAIJViennaCL(). */
  if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"PETSC_OFFLOAD_GPU should not happen. Please report your use case to petsc-dev@mcs.anl.gov");
  /* TODO: add support for this? */
  if (flg) {
    /* Bound to CPU: fall back to the host SeqAIJ kernels and drop the GPU-only hooks */
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  } else {
    /* Unbound: install the CUSPARSE kernels and the GPU matrix products */
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat;

  /* An AIJCUSPARSE matrix hands out CUDA vectors by default */
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != 
MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      /* Plain matrix: attach the CUSPARSE mult scaffolding */
      Mat_SeqAIJCUSPARSE *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      spptr->format = MAT_CUSPARSE_CSR;
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    } else {
      /* Factored matrix: attach the triangular-factor scaffolding instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* Installs the GPU mult kernels and the GPU matrix-product hooks */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  /* PetscObjectOptionsBegin() declares PetscOptionsObject used below */
  ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr);
  ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);


PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Register the cusparse solver for every factorization type it supports */
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Free the GPU side of a non-factored AIJCUSPARSE matrix (both mult structs, work vectors, handle) */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = 
MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    /* extra parentheses document that the assignment-in-condition is intentional */
    if ((handle = (*cusparsestruct)->handle)) {stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cudaError_t cerr = cudaFree((*cusparsestruct)->csr2cscBuffer);CHKERRCUDA(cerr); /* cudaFree(NULL) is a no-op */
#endif
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Free the device storage of a CSR matrix and the CsrMatrix wrapper itself; *mat is zeroed */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    delete (*mat)->values;
    delete (*mat)->column_indices;
    delete (*mat)->row_offsets;
    delete *mat;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}

/* Release everything owned by one triangular-factor struct: descriptor, analysis info,
   CSR data and (CUDA >= 11) the solve/csr2csc scratch buffers */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cudaError_t cerr;
    if ((*trifactor)->solveBuffer)   {cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
    if ((*trifactor)->csr2cscBuffer) {cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
#endif
    delete *trifactor;
    *trifactor = 0;
  }
  PetscFunctionReturn(0);
}

/* Release everything owned by one SpMV struct: the CSR or HYB/ELL matrix data, descriptors,
   device-resident scalars (alpha/beta) and (CUDA >= 11) the cached SpMV descriptors/buffers */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
#endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr); /* fix: error code was silently dropped */
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    /* One cached cuSpMV entry per cusparseOperation_t value (N, T, H) */
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
#endif
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}

/* Free the factor data held by a TriFactors struct but keep the struct and its handle alive */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = 0;
    (*trifactors)->cpermIndices = 0;
    (*trifactors)->workVector   = 0;
  }
  PetscFunctionReturn(0);
}

/* Fully destroy a TriFactors struct: reset its contents, destroy the handle, free the struct */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    /* extra parentheses document that the assignment-in-condition is intentional */
    if ((handle = (*trifactors)->handle)) {
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}