/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>

const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
  } cusparseCsr2CscAlg_t;
*/
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif
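/* Note on the string arrays above: PetscOptionsEnum() returns the 0-based position of the matched
   string, so each array must list its entries in the integer-value order of the corresponding
   cuSPARSE enum (this is also why the consistency checks in MatSetFromOptions_SeqAIJCUSPARSE()
   below exist).  A sketch of the mapping, assuming the CUDA-11.0 values quoted above:

     -mat_cusparse_spmv_alg csrmv_alg1   ->  position 2 in MatCUSPARSESpMVAlgorithms[]
                                         ->  (cusparseSpMVAlg_t)2 == CUSPARSE_CSRMV_ALG1

   The "INVALID" placeholder in MatCUSPARSECsr2CscAlgorithms[] pads position 0 because
   cusparseCsr2CscAlg_t starts at 1. */
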
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}
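/* A minimal usage sketch for the two setters above (hypothetical caller code; the caller creates
   and owns the stream and handle, and A is assumed to already be a MATSEQAIJCUSPARSE matrix):

     cudaStream_t     stream;
     cusparseHandle_t handle;
     cudaError_t      cerr = cudaStreamCreate(&stream);CHKERRCUDA(cerr);
     cusparseStatus_t stat = cusparseCreate(&handle);CHKERRCUSPARSE(stat);
     ierr = MatCUSPARSESetHandle(A,handle);CHKERRQ(ierr);  // replaces (and destroys) any previous handle
     ierr = MatCUSPARSESetStream(A,stream);CHKERRQ(ierr);  // later cuSPARSE calls on A use this stream
*/
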
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          flg;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg || !cusparsestruct) PetscFunctionReturn(0);
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for sequential matrices
  on a single GPU of type seqaijcusparse, aijcusparse, seqaijcusp, or aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
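/* A typical runtime selection of this solver (a sketch following standard PETSc usage; see
   PCFactorSetMatSolverType() in the .seealso list above):

     -mat_type aijcusparse -pc_type ilu -pc_factor_mat_solver_type cusparse

   or programmatically:

     ierr = PCSetType(pc,PCILU);CHKERRQ(ierr);
     ierr = PCFactorSetMatSolverType(pc,MATSOLVERCUSPARSE);CHKERRQ(ierr);
*/
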
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  (*B)->useordering = PETSC_TRUE;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
  MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
  operation. Only the MatMult operation can use different GPU storage formats
  for MPIAIJCUSPARSE matrices.

  Not Collective

  Input Parameters:
+ A - Matrix of type SEQAIJCUSPARSE
. op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
- format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

  Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
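/* A minimal usage sketch for the format interface above (hypothetical application code).  The
   storage format can be set through the API,

     ierr = MatSetType(A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,MAT_CUSPARSE_ELL);CHKERRQ(ierr);  // ELL for SpMV only

   or through the options database consumed by MatSetFromOptions_SeqAIJCUSPARSE() below, e.g.

     -mat_type seqaijcusparse -mat_cusparse_mult_storage_format ell
*/
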
/*@
  MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose

  Collective on mat

  Input Parameters:
+ A - Matrix of type SEQAIJCUSPARSE
- transgen - the boolean flag

  Level: intermediate

.seealso: MATSEQAIJCUSPARSE
@*/
PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
{
  PetscErrorCode ierr;
  PetscBool      flg;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (flg) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
    cusp->transgen = transgen;
    if (!transgen) { /* need to destroy the transpose matrix if present to prevent logic errors if transgen is set to true later */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscBool transgen = cusparsestruct->transgen;

    ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If the user used this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;

        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                                 );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h = AALo;
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v      = aa;
        vi     = aj;
        offset = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
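/* Illustration of the unit-lower-triangular layout built above (derived from the fill loop, not an
   authoritative spec).  For n = 3 with strictly-lower entries l10, l20, l21 taken from the AIJ
   arrays, the routine produces

     AiLo = {0, 1, 3, 6}                 row offsets: row i holds its strict lower part plus a diagonal 1
     AjLo = {0,  0,1,  0,1,2}            column indices: the diagonal index i is appended last in each row
     AALo = {1,  l10,1,  l20,l21,1}      values: explicit 1.0 on the diagonal (matching CUSPARSE_DIAG_TYPE_UNIT)
*/
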
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;
          vi = aj + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1./v[nz];
          AiUp[i]      = AiUp[i+1] - (nz+1);

          ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = n;
        upTriFactor->csrMat->num_cols = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                                 );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
        upTriFactor->AA_h = AAUp;
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      } else {
        if (!upTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v = aa + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1./v[nz];
          ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
        ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
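/* Note on the layout built above (as read from the fill loop): each row of the upper factor is
   stored diagonal-first, followed by the strictly-upper entries.  PETSc's SeqAIJ factorization
   keeps the U diagonal pre-inverted (aa[adiag[i]] holds 1/u_ii), so the 1./v[nz] above recovers
   the true diagonal value u_ii that CUSPARSE_DIAG_TYPE_NON_UNIT expects to divide by. */
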
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cusparseTriFactors->nnz=a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
    ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* upper triangular indices */
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
    ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
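/* Note: rpermIndices/cpermIndices hold the factorization row/column orderings on the GPU; the
   non-natural-ordering MatSolve paths are expected to gather the right-hand side through
   rpermIndices before the two triangular solves and scatter the solution through cpermIndices
   afterwards (a reading of this file's solve routines, not a formal contract). */
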
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                                 );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                                 );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else {
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
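/* Design note (as read from the code above): for ICC only the upper triangle is materialized on
   the GPU.  The "lower" factor reuses the same upper-triangular CSR pattern with
   solveOp = CUSPARSE_OPERATION_TRANSPOSE and CUSPARSE_FILL_MODE_UPPER, i.e. L = U^T is solved
   without storing a second sparsity pattern; only a separately scaled value array (AALo) differs. */
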
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip,*rip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
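/* The nnz count above expands the stored upper triangle to the full symmetric pattern:
   (a->nz - n) strictly-upper entries are counted twice (once per triangle) plus n diagonal
   entries, i.e. (a->nz - n)*2 + n.  For example, a tridiagonal factor with n = 4 has
   a->nz = 7 (4 diagonal + 3 superdiagonal), giving 2*3 + 4 = 10. */
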
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
               CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                       loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                       loTriFactor->csrMat->values->data().get(),
                                       loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(),
                                       loTriFactorT->csrMat->values->data().get(),
                                       loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
#else
                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
#endif
                          );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                           loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
#endif
                           );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
               CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                       upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                       upTriFactor->csrMat->values->data().get(),
                                       upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(),
                                       upTriFactorT->csrMat->values->data().get(),
                                       upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
#else
                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
#endif
                          );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                           upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
#endif
                           );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
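/* Implementation note (a reading of the two conversions above, not cuSPARSE documentation):
   csr2csc produces the CSC representation of each factor, and the CSC arrays of a matrix are
   exactly the CSR arrays of its transpose.  Both transposed factors are therefore built once
   here, so the transpose solves can run ordinary CUSPARSE_OPERATION_NON_TRANSPOSE solves on
   pre-transposed data instead of transposing inside every solve. */
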
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (!cusparsestruct->transgen || cusparsestruct->matTranspose) PetscFunctionReturn(0);
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* create cusparse matrix */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

  /* set alpha and beta */
  err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
    cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
    /* compute the transpose, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                         A->cmap->n, matrix->num_entries,
                                         matrix->values->data().get(),
                                         cusparsestruct->rowoffsets_gpu->data().get(),
                                         matrix->column_indices->data().get(),
                                         matrixT->values->data().get(),
                                         matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                         CUSPARSE_ACTION_NUMERIC,indexBase,
                                         cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat);
    err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err);
#endif

    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            cusparsestruct->rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                            CUSPARSE_ACTION_NUMERIC,indexBase,
                            cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
#else
                            matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase
#endif
                            );CHKERRCUSPARSE(stat);
    matstructT->mat = matrixT;

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    stat = cusparseCreateCsr(&matstructT->matDescr,
                             matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                             matrixT->values->data().get(),
                             CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                             indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
the CSC */ 1236 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1237 stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, 1238 A->cmap->n, matrix->num_entries, 1239 matrix->values->data().get(), 1240 cusparsestruct->rowoffsets_gpu->data().get(), 1241 matrix->column_indices->data().get(), 1242 matrixT->values->data().get(), 1243 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1244 CUSPARSE_ACTION_NUMERIC,indexBase, 1245 cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1246 err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err); 1247 #endif 1248 1249 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1250 A->cmap->n, matrix->num_entries, 1251 matrix->values->data().get(), 1252 cusparsestruct->rowoffsets_gpu->data().get(), 1253 matrix->column_indices->data().get(), 1254 matrixT->values->data().get(), 1255 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1256 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1257 CUSPARSE_ACTION_NUMERIC,indexBase, 1258 cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer 1259 #else 1260 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1261 CUSPARSE_ACTION_NUMERIC, indexBase 1262 #endif 1263 );CHKERRCUSPARSE(stat); 1264 matstructT->mat = matrixT; 1265 1266 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1267 stat = cusparseCreateCsr(&matstructT->matDescr, 1268 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1269 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1270 matrixT->values->data().get(), 1271 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1272 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 1273 #endif 1274 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1275 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1276 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1277 #else 1278 CsrMatrix *temp = new CsrMatrix; 1279 CsrMatrix *tempT = new CsrMatrix; 1280 /* First convert HYB to CSR */ 1281 temp->num_rows = A->rmap->n; 1282 temp->num_cols = A->cmap->n; 1283 temp->num_entries = a->nz; 1284 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1285 temp->column_indices = new THRUSTINTARRAY32(a->nz); 1286 temp->values = new THRUSTARRAY(a->nz); 1287 1288 stat = cusparse_hyb2csr(cusparsestruct->handle, 1289 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 1290 temp->values->data().get(), 1291 temp->row_offsets->data().get(), 1292 temp->column_indices->data().get());CHKERRCUSPARSE(stat); 1293 1294 /* Next, convert CSR to CSC (i.e. 
the matrix transpose) */ 1295 tempT->num_rows = A->rmap->n; 1296 tempT->num_cols = A->cmap->n; 1297 tempT->num_entries = a->nz; 1298 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1299 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1300 tempT->values = new THRUSTARRAY(a->nz); 1301 1302 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1303 temp->num_cols, temp->num_entries, 1304 temp->values->data().get(), 1305 temp->row_offsets->data().get(), 1306 temp->column_indices->data().get(), 1307 tempT->values->data().get(), 1308 tempT->column_indices->data().get(), 1309 tempT->row_offsets->data().get(), 1310 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1311 1312 /* Last, convert CSC to HYB */ 1313 cusparseHybMat_t hybMat; 1314 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1315 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1316 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1317 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1318 matstructT->descr, tempT->values->data().get(), 1319 tempT->row_offsets->data().get(), 1320 tempT->column_indices->data().get(), 1321 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1322 1323 /* assign the pointer */ 1324 matstructT->mat = hybMat; 1325 /* delete temporaries */ 1326 if (tempT) { 1327 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1328 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1329 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1330 delete (CsrMatrix*) tempT; 1331 } 1332 if (temp) { 1333 if (temp->values) delete (THRUSTARRAY*) temp->values; 1334 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1335 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1336 delete (CsrMatrix*) temp; 1337 } 1338 #endif 1339 } 1340 err = WaitForCUDA();CHKERRCUDA(err); 1341 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1342 ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1343 /* the compressed row indices are not used for matTranspose */ 1344 matstructT->cprowIndices = NULL; 1345 /* assign the pointer */ 1346 ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 1347 PetscFunctionReturn(0); 1348 } 1349 1350 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 1351 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1352 { 1353 PetscInt n = xx->map->n; 1354 const PetscScalar *barray; 1355 PetscScalar *xarray; 1356 thrust::device_ptr<const PetscScalar> bGPU; 1357 thrust::device_ptr<PetscScalar> xGPU; 1358 cusparseStatus_t stat; 1359 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1360 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1361 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1362 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1363 PetscErrorCode ierr; 1364 cudaError_t cerr; 1365 1366 PetscFunctionBegin; 1367 /* Analyze the matrix and create the transpose ... 
on the fly */ 1368 if (!loTriFactorT && !upTriFactorT) { 1369 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1370 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1371 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1372 } 1373 1374 /* Get the GPU pointers */ 1375 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1376 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1377 xGPU = thrust::device_pointer_cast(xarray); 1378 bGPU = thrust::device_pointer_cast(barray); 1379 1380 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1381 /* First, reorder with the row permutation */ 1382 thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1383 thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 1384 xGPU); 1385 1386 /* Next, solve U */ 1387 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1388 upTriFactorT->csrMat->num_rows, 1389 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1390 upTriFactorT->csrMat->num_entries, 1391 #endif 1392 &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1393 upTriFactorT->csrMat->values->data().get(), 1394 upTriFactorT->csrMat->row_offsets->data().get(), 1395 upTriFactorT->csrMat->column_indices->data().get(), 1396 upTriFactorT->solveInfo, 1397 xarray, tempGPU->data().get() 1398 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1399 ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1400 #endif 1401 );CHKERRCUSPARSE(stat); 1402 1403 /* Then, solve L */ 1404 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1405 loTriFactorT->csrMat->num_rows, 1406 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1407 loTriFactorT->csrMat->num_entries, 1408 #endif 1409 &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1410 loTriFactorT->csrMat->values->data().get(), 1411 loTriFactorT->csrMat->row_offsets->data().get(), 1412 loTriFactorT->csrMat->column_indices->data().get(), 1413 loTriFactorT->solveInfo, 1414 tempGPU->data().get(), xarray 1415 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1416 ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1417 #endif 1418 );CHKERRCUSPARSE(stat); 1419 1420 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1421 thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1422 thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1423 tempGPU->begin()); 1424 1425 /* Copy the temporary to the full solution. 
*/ 1426 thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU); 1427 1428 /* restore */ 1429 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1430 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1431 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1432 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1433 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1434 PetscFunctionReturn(0); 1435 } 1436 1437 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1438 { 1439 const PetscScalar *barray; 1440 PetscScalar *xarray; 1441 cusparseStatus_t stat; 1442 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1443 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1444 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1445 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1446 PetscErrorCode ierr; 1447 cudaError_t cerr; 1448 1449 PetscFunctionBegin; 1450 /* Analyze the matrix and create the transpose ... on the fly */ 1451 if (!loTriFactorT && !upTriFactorT) { 1452 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1453 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1454 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1455 } 1456 1457 /* Get the GPU pointers */ 1458 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1459 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1460 1461 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1462 /* First, solve U */ 1463 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1464 upTriFactorT->csrMat->num_rows, 1465 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1466 upTriFactorT->csrMat->num_entries, 1467 #endif 1468 &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1469 upTriFactorT->csrMat->values->data().get(), 1470 upTriFactorT->csrMat->row_offsets->data().get(), 1471 upTriFactorT->csrMat->column_indices->data().get(), 1472 upTriFactorT->solveInfo, 1473 barray, tempGPU->data().get() 1474 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1475 ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1476 #endif 1477 );CHKERRCUSPARSE(stat); 1478 1479 /* Then, solve L */ 1480 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1481 loTriFactorT->csrMat->num_rows, 1482 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1483 loTriFactorT->csrMat->num_entries, 1484 #endif 1485 &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1486 loTriFactorT->csrMat->values->data().get(), 1487 loTriFactorT->csrMat->row_offsets->data().get(), 1488 loTriFactorT->csrMat->column_indices->data().get(), 1489 loTriFactorT->solveInfo, 1490 tempGPU->data().get(), xarray 1491 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1492 ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1493 #endif 1494 );CHKERRCUSPARSE(stat); 1495 1496 /* restore */ 1497 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1498 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1499 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1500 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1501 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1502 PetscFunctionReturn(0); 1503 } 1504 1505 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1506 { 1507 const PetscScalar *barray; 1508 PetscScalar 
*xarray; 1509 thrust::device_ptr<const PetscScalar> bGPU; 1510 thrust::device_ptr<PetscScalar> xGPU; 1511 cusparseStatus_t stat; 1512 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1513 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1514 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1515 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1516 PetscErrorCode ierr; 1517 cudaError_t cerr; 1518 1519 PetscFunctionBegin; 1520 1521 /* Get the GPU pointers */ 1522 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1523 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1524 xGPU = thrust::device_pointer_cast(xarray); 1525 bGPU = thrust::device_pointer_cast(barray); 1526 1527 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1528 /* First, reorder with the row permutation */ 1529 thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1530 thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 1531 tempGPU->begin()); 1532 1533 /* Next, solve L */ 1534 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1535 loTriFactor->csrMat->num_rows, 1536 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1537 loTriFactor->csrMat->num_entries, 1538 #endif 1539 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1540 loTriFactor->csrMat->values->data().get(), 1541 loTriFactor->csrMat->row_offsets->data().get(), 1542 loTriFactor->csrMat->column_indices->data().get(), 1543 loTriFactor->solveInfo, 1544 tempGPU->data().get(), xarray 1545 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1546 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 1547 #endif 1548 );CHKERRCUSPARSE(stat); 1549 1550 /* Then, solve U */ 1551 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1552 upTriFactor->csrMat->num_rows, 1553 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1554 upTriFactor->csrMat->num_entries, 1555 #endif 1556 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1557 upTriFactor->csrMat->values->data().get(), 1558 upTriFactor->csrMat->row_offsets->data().get(), 1559 upTriFactor->csrMat->column_indices->data().get(), 1560 upTriFactor->solveInfo, 1561 xarray, tempGPU->data().get() 1562 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1563 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 1564 #endif 1565 );CHKERRCUSPARSE(stat); 1566 1567 /* Last, reorder with the column permutation */ 1568 thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 1569 thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), 1570 xGPU); 1571 1572 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1573 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1574 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1575 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1576 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1577 PetscFunctionReturn(0); 1578 } 1579 1580 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1581 { 1582 const PetscScalar *barray; 1583 PetscScalar *xarray; 1584 cusparseStatus_t stat; 1585 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1586 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1587 
Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1588 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1589 PetscErrorCode ierr; 1590 cudaError_t cerr; 1591 1592 PetscFunctionBegin; 1593 /* Get the GPU pointers */ 1594 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1595 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1596 1597 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1598 /* First, solve L */ 1599 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1600 loTriFactor->csrMat->num_rows, 1601 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1602 loTriFactor->csrMat->num_entries, 1603 #endif 1604 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1605 loTriFactor->csrMat->values->data().get(), 1606 loTriFactor->csrMat->row_offsets->data().get(), 1607 loTriFactor->csrMat->column_indices->data().get(), 1608 loTriFactor->solveInfo, 1609 barray, tempGPU->data().get() 1610 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1611 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 1612 #endif 1613 );CHKERRCUSPARSE(stat); 1614 1615 /* Next, solve U */ 1616 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1617 upTriFactor->csrMat->num_rows, 1618 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1619 upTriFactor->csrMat->num_entries, 1620 #endif 1621 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1622 upTriFactor->csrMat->values->data().get(), 1623 upTriFactor->csrMat->row_offsets->data().get(), 1624 upTriFactor->csrMat->column_indices->data().get(), 1625 upTriFactor->solveInfo, 1626 tempGPU->data().get(), xarray 1627 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1628 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 1629 #endif 1630 );CHKERRCUSPARSE(stat); 1631 1632 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1633 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1634 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1635 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1636 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1637 PetscFunctionReturn(0); 1638 } 1639 1640 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) 1641 { 1642 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1643 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1644 cudaError_t cerr; 1645 PetscErrorCode ierr; 1646 1647 PetscFunctionBegin; 1648 if (A->offloadmask == PETSC_OFFLOAD_GPU) { 1649 CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat; 1650 1651 ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 1652 cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 1653 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1654 ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr); 1655 ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 1656 A->offloadmask = PETSC_OFFLOAD_BOTH; 1657 } 1658 PetscFunctionReturn(0); 1659 } 1660 1661 static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 1662 { 1663 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1664 PetscErrorCode ierr; 1665 1666 PetscFunctionBegin; 1667 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 1668 *array = a->a; 1669 A->offloadmask = PETSC_OFFLOAD_CPU; 1670 PetscFunctionReturn(0); 1671 } 1672 1673 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 1674 { 1675 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1676 Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 1677 Mat_SeqAIJ *a = 
(Mat_SeqAIJ*)A->data; 1678 PetscInt m = A->rmap->n,*ii,*ridx,tmp; 1679 PetscErrorCode ierr; 1680 cusparseStatus_t stat; 1681 cudaError_t err; 1682 1683 PetscFunctionBegin; 1684 if (A->boundtocpu) PetscFunctionReturn(0); 1685 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 1686 if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { 1687 /* Copy values only */ 1688 CsrMatrix *matrix,*matrixT; 1689 matrix = (CsrMatrix*)cusparsestruct->mat->mat; 1690 1691 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1692 matrix->values->assign(a->a, a->a+a->nz); 1693 err = WaitForCUDA();CHKERRCUDA(err); 1694 ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 1695 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1696 1697 /* Update matT when it was built before */ 1698 if (cusparsestruct->matTranspose) { 1699 cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr); 1700 matrixT = (CsrMatrix*)cusparsestruct->matTranspose->mat; 1701 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1702 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1703 A->cmap->n, matrix->num_entries, 1704 matrix->values->data().get(), 1705 cusparsestruct->rowoffsets_gpu->data().get(), 1706 matrix->column_indices->data().get(), 1707 matrixT->values->data().get(), 1708 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1709 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1710 CUSPARSE_ACTION_NUMERIC,indexBase, 1711 cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer 1712 #else 1713 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1714 CUSPARSE_ACTION_NUMERIC, indexBase 1715 #endif 1716 );CHKERRCUSPARSE(stat); 1717 err = WaitForCUDA();CHKERRCUDA(err); 1718 ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1719 } 1720 } else { 1721 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1722 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr); 1723 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr); 1724 delete cusparsestruct->workVector; 1725 delete cusparsestruct->rowoffsets_gpu; 1726 try { 1727 if (a->compressedrow.use) { 1728 m = a->compressedrow.nrows; 1729 ii = a->compressedrow.i; 1730 ridx = a->compressedrow.rindex; 1731 } else { 1732 m = A->rmap->n; 1733 ii = a->i; 1734 ridx = NULL; 1735 } 1736 cusparsestruct->nrows = m; 1737 1738 /* create cusparse matrix */ 1739 matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 1740 stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat); 1741 stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 1742 stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1743 1744 err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 1745 err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1746 err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1747 err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1748 err = 
cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1749 err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1750 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 1751 1752 /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1753 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1754 /* set the matrix */ 1755 CsrMatrix *mat= new CsrMatrix; 1756 mat->num_rows = m; 1757 mat->num_cols = A->cmap->n; 1758 mat->num_entries = a->nz; 1759 mat->row_offsets = new THRUSTINTARRAY32(m+1); 1760 mat->row_offsets->assign(ii, ii + m+1); 1761 1762 mat->column_indices = new THRUSTINTARRAY32(a->nz); 1763 mat->column_indices->assign(a->j, a->j+a->nz); 1764 1765 mat->values = new THRUSTARRAY(a->nz); 1766 mat->values->assign(a->a, a->a+a->nz); 1767 1768 /* assign the pointer */ 1769 matstruct->mat = mat; 1770 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1771 if (mat->num_rows) { /* cusparse errors on empty matrices! */ 1772 stat = cusparseCreateCsr(&matstruct->matDescr, 1773 mat->num_rows, mat->num_cols, mat->num_entries, 1774 mat->row_offsets->data().get(), mat->column_indices->data().get(), 1775 mat->values->data().get(), 1776 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 1777 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 1778 } 1779 #endif 1780 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1781 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1782 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1783 #else 1784 CsrMatrix *mat= new CsrMatrix; 1785 mat->num_rows = m; 1786 mat->num_cols = A->cmap->n; 1787 mat->num_entries = a->nz; 1788 mat->row_offsets = new THRUSTINTARRAY32(m+1); 1789 mat->row_offsets->assign(ii, ii + m+1); 1790 1791 mat->column_indices = new THRUSTINTARRAY32(a->nz); 1792 mat->column_indices->assign(a->j, a->j+a->nz); 1793 1794 mat->values = new THRUSTARRAY(a->nz); 1795 mat->values->assign(a->a, a->a+a->nz); 1796 1797 cusparseHybMat_t hybMat; 1798 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1799 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 
1800 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1801 stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, 1802 matstruct->descr, mat->values->data().get(), 1803 mat->row_offsets->data().get(), 1804 mat->column_indices->data().get(), 1805 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1806 /* assign the pointer */ 1807 matstruct->mat = hybMat; 1808 1809 if (mat) { 1810 if (mat->values) delete (THRUSTARRAY*)mat->values; 1811 if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices; 1812 if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets; 1813 delete (CsrMatrix*)mat; 1814 } 1815 #endif 1816 } 1817 1818 /* assign the compressed row indices */ 1819 if (a->compressedrow.use) { 1820 cusparsestruct->workVector = new THRUSTARRAY(m); 1821 matstruct->cprowIndices = new THRUSTINTARRAY(m); 1822 matstruct->cprowIndices->assign(ridx,ridx+m); 1823 tmp = m; 1824 } else { 1825 cusparsestruct->workVector = NULL; 1826 matstruct->cprowIndices = NULL; 1827 tmp = 0; 1828 } 1829 ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr); 1830 1831 /* assign the pointer */ 1832 cusparsestruct->mat = matstruct; 1833 } catch(char *ex) { 1834 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1835 } 1836 err = WaitForCUDA();CHKERRCUDA(err); 1837 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1838 cusparsestruct->nonzerostate = A->nonzerostate; 1839 } 1840 A->offloadmask = PETSC_OFFLOAD_BOTH; 1841 } 1842 PetscFunctionReturn(0); 1843 } 1844 1845 struct VecCUDAPlusEquals 1846 { 1847 template <typename Tuple> 1848 __host__ __device__ 1849 void operator()(Tuple t) 1850 { 1851 thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 1852 } 1853 }; 1854 1855 struct VecCUDAEquals 1856 { 1857 template <typename Tuple> 1858 __host__ __device__ 1859 void operator()(Tuple t) 1860 { 1861 thrust::get<1>(t) = thrust::get<0>(t); 1862 } 1863 }; 1864 1865 struct VecCUDAEqualsReverse 1866 { 1867 template <typename Tuple> 1868 __host__ __device__ 1869 void operator()(Tuple t) 1870 { 1871 thrust::get<0>(t) = thrust::get<1>(t); 1872 } 1873 }; 1874 1875 struct MatMatCusparse { 1876 PetscBool cisdense; 1877 PetscScalar *Bt; 1878 Mat X; 1879 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1880 PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 1881 cusparseDnMatDescr_t matBDescr; 1882 cusparseDnMatDescr_t matCDescr; 1883 size_t spmmBufferSize; 1884 void *spmmBuffer; 1885 PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/ 1886 #endif 1887 }; 1888 1889 static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 1890 { 1891 PetscErrorCode ierr; 1892 MatMatCusparse *mmdata = (MatMatCusparse *)data; 1893 cudaError_t cerr; 1894 1895 PetscFunctionBegin; 1896 cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr); 1897 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1898 cusparseStatus_t stat; 1899 if (mmdata->matBDescr) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat);} 1900 if (mmdata->matCDescr) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat);} 1901 if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);} 1902 #endif 1903 ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr); 1904 ierr = PetscFree(data);CHKERRQ(ierr); 1905 PetscFunctionReturn(0); 1906 } 1907 1908 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool); 1909 1910 static PetscErrorCode 
MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 1911 { 1912 Mat_Product *product = C->product; 1913 Mat A,B; 1914 PetscInt m,n,blda,clda; 1915 PetscBool flg,biscuda; 1916 Mat_SeqAIJCUSPARSE *cusp; 1917 cusparseStatus_t stat; 1918 cusparseOperation_t opA; 1919 const PetscScalar *barray; 1920 PetscScalar *carray; 1921 PetscErrorCode ierr; 1922 MatMatCusparse *mmdata; 1923 Mat_SeqAIJCUSPARSEMultStruct *mat; 1924 CsrMatrix *csrmat; 1925 cudaError_t cerr; 1926 1927 PetscFunctionBegin; 1928 MatCheckProduct(C,1); 1929 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 1930 mmdata = (MatMatCusparse*)product->data; 1931 A = product->A; 1932 B = product->B; 1933 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 1934 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 1935 /* currently CopyToGpu does not copy if the matrix is bound to CPU 1936 Instead of silently accepting the wrong answer, I prefer to raise the error */ 1937 if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 1938 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1939 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1940 switch (product->type) { 1941 case MATPRODUCT_AB: 1942 case MATPRODUCT_PtAP: 1943 mat = cusp->mat; 1944 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1945 m = A->rmap->n; 1946 n = B->cmap->n; 1947 break; 1948 case MATPRODUCT_AtB: 1949 if (!cusp->transgen) { 1950 mat = cusp->mat; 1951 opA = CUSPARSE_OPERATION_TRANSPOSE; 1952 } else { 1953 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1954 mat = cusp->matTranspose; 1955 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1956 } 1957 m = A->cmap->n; 1958 n = B->cmap->n; 1959 break; 1960 case MATPRODUCT_ABt: 1961 case MATPRODUCT_RARt: 1962 mat = cusp->mat; 1963 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1964 m = A->rmap->n; 1965 n = B->rmap->n; 1966 break; 1967 default: 1968 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 1969 } 1970 if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 1971 csrmat = (CsrMatrix*)mat->mat; 1972 /* if the user passed a CPU matrix, copy the data to the GPU */ 1973 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr); 1974 if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);} 1975 ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr); 1976 1977 ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr); 1978 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 1979 ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 1980 ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr); 1981 } else { 1982 ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr); 1983 ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr); 1984 } 1985 1986 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1987 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1988 cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? 
CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 1989 /* (re)allocate spmmBuffer if not initialized or LDAs are different */ 1990 if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 1991 if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;} 1992 if (!mmdata->matBDescr) { 1993 stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 1994 mmdata->Blda = blda; 1995 } 1996 1997 if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;} 1998 if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 1999 stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2000 mmdata->Clda = clda; 2001 } 2002 2003 if (!mat->matDescr) { 2004 stat = cusparseCreateCsr(&mat->matDescr, 2005 csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, 2006 csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), 2007 csrmat->values->data().get(), 2008 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2009 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 2010 } 2011 stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one, 2012 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2013 mmdata->matCDescr,cusparse_scalartype, 2014 cusp->spmmAlg,&mmdata->spmmBufferSize);CHKERRCUSPARSE(stat); 2015 if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);} 2016 cerr = cudaMalloc(&mmdata->spmmBuffer,mmdata->spmmBufferSize);CHKERRCUDA(cerr); 2017 mmdata->initialized = PETSC_TRUE; 2018 } else { 2019 /* to be safe, always update pointers of the mats */ 2020 stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat); 2021 stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat); 2022 stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat); 2023 } 2024 2025 /* do cusparseSpMM, which supports transpose on B */ 2026 stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one, 2027 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2028 mmdata->matCDescr,cusparse_scalartype, 2029 cusp->spmmAlg,mmdata->spmmBuffer);CHKERRCUSPARSE(stat); 2030 #else 2031 PetscInt k; 2032 /* cusparseXcsrmm does not support transpose on B */ 2033 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2034 cublasHandle_t cublasv2handle; 2035 cublasStatus_t cerr; 2036 2037 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 2038 cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T, 2039 B->cmap->n,B->rmap->n, 2040 &PETSC_CUSPARSE_ONE ,barray,blda, 2041 &PETSC_CUSPARSE_ZERO,barray,blda, 2042 mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr); 2043 blda = B->cmap->n; 2044 k = B->cmap->n; 2045 } else { 2046 k = B->rmap->n; 2047 } 2048 2049 /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 2050 stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k, 2051 csrmat->num_entries,mat->alpha_one,mat->descr, 2052 csrmat->values->data().get(), 2053 csrmat->row_offsets->data().get(), 2054 csrmat->column_indices->data().get(), 2055 mmdata->Bt ? 
mmdata->Bt : barray,blda,mat->beta_zero, 2056 carray,clda);CHKERRCUSPARSE(stat); 2057 #endif 2058 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2059 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2060 ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr); 2061 ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr); 2062 if (product->type == MATPRODUCT_RARt) { 2063 ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2064 ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2065 } else if (product->type == MATPRODUCT_PtAP) { 2066 ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2067 ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2068 } else { 2069 ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr); 2070 } 2071 if (mmdata->cisdense) { 2072 ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr); 2073 } 2074 if (!biscuda) { 2075 ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 2076 } 2077 PetscFunctionReturn(0); 2078 } 2079 2080 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2081 { 2082 Mat_Product *product = C->product; 2083 Mat A,B; 2084 PetscInt m,n; 2085 PetscBool cisdense,flg; 2086 PetscErrorCode ierr; 2087 MatMatCusparse *mmdata; 2088 Mat_SeqAIJCUSPARSE *cusp; 2089 2090 PetscFunctionBegin; 2091 MatCheckProduct(C,1); 2092 if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty"); 2093 A = product->A; 2094 B = product->B; 2095 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2096 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 2097 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2098 if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2099 switch (product->type) { 2100 case MATPRODUCT_AB: 2101 m = A->rmap->n; 2102 n = B->cmap->n; 2103 break; 2104 case MATPRODUCT_AtB: 2105 m = A->cmap->n; 2106 n = B->cmap->n; 2107 break; 2108 case MATPRODUCT_ABt: 2109 m = A->rmap->n; 2110 n = B->rmap->n; 2111 break; 2112 case MATPRODUCT_PtAP: 2113 m = B->cmap->n; 2114 n = B->cmap->n; 2115 break; 2116 case MATPRODUCT_RARt: 2117 m = B->rmap->n; 2118 n = B->rmap->n; 2119 break; 2120 default: 2121 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 2122 } 2123 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2124 /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */ 2125 ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr); 2126 ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr); 2127 2128 /* product data */ 2129 ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2130 mmdata->cisdense = cisdense; 2131 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0) 2132 /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */ 2133 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2134 cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr); 2135 } 2136 #endif 2137 /* for these products we need intermediate storage */ 2138 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2139 ierr = 
MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr); 2140 ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr); 2141 if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */ 2142 ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr); 2143 } else { 2144 ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr); 2145 } 2146 } 2147 C->product->data = mmdata; 2148 C->product->destroy = MatDestroy_MatMatCusparse; 2149 2150 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 2151 PetscFunctionReturn(0); 2152 } 2153 2154 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 2155 2156 /* handles dense B */ 2157 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C) 2158 { 2159 Mat_Product *product = C->product; 2160 PetscErrorCode ierr; 2161 2162 PetscFunctionBegin; 2163 MatCheckProduct(C,1); 2164 if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A"); 2165 if (product->A->boundtocpu) { 2166 ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(C);CHKERRQ(ierr); 2167 PetscFunctionReturn(0); 2168 } 2169 switch (product->type) { 2170 case MATPRODUCT_AB: 2171 case MATPRODUCT_AtB: 2172 case MATPRODUCT_ABt: 2173 case MATPRODUCT_PtAP: 2174 case MATPRODUCT_RARt: 2175 C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 2176 default: 2177 break; 2178 } 2179 PetscFunctionReturn(0); 2180 } 2181 2182 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2183 { 2184 PetscErrorCode ierr; 2185 2186 PetscFunctionBegin; 2187 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2188 PetscFunctionReturn(0); 2189 } 2190 2191 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz) 2192 { 2193 PetscErrorCode ierr; 2194 2195 PetscFunctionBegin; 2196 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2197 PetscFunctionReturn(0); 2198 } 2199 2200 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2201 { 2202 PetscErrorCode ierr; 2203 2204 PetscFunctionBegin; 2205 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 2206 PetscFunctionReturn(0); 2207 } 2208 2209 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2210 { 2211 PetscErrorCode ierr; 2212 2213 PetscFunctionBegin; 2214 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 2215 PetscFunctionReturn(0); 2216 } 2217 2218 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2219 { 2220 PetscErrorCode ierr; 2221 2222 PetscFunctionBegin; 2223 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2224 PetscFunctionReturn(0); 2225 } 2226 2227 /* z = op(A) x + y. 
If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 2228 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm) 2229 { 2230 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2231 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 2232 Mat_SeqAIJCUSPARSEMultStruct *matstruct; 2233 PetscScalar *xarray,*zarray,*dptr,*beta,*xptr; 2234 PetscErrorCode ierr; 2235 cudaError_t cerr; 2236 cusparseStatus_t stat; 2237 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2238 PetscBool compressed; 2239 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2240 PetscInt nx,ny; 2241 #endif 2242 2243 PetscFunctionBegin; 2244 if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported"); 2245 if (!a->nonzerorowcnt) { 2246 if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);} 2247 else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);} 2248 PetscFunctionReturn(0); 2249 } 2250 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 2251 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2252 if (!trans) { 2253 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 2254 if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 2255 } else { 2256 if (herm || !cusparsestruct->transgen) { 2257 opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 2258 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 2259 } else { 2260 if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);} 2261 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 2262 } 2263 } 2264 /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 2265 compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE; 2266 2267 try { 2268 ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 2269 if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */ 2270 else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */ 2271 2272 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2273 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 2274 /* z = A x + beta y. 2275 If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 2276 When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 2277 */ 2278 xptr = xarray; 2279 dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 2280 beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 2281 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2282 /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 2283 allocated to accommodate different uses. So we get the length info directly from mat. 2284 */ 2285 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2286 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2287 nx = mat->num_cols; 2288 ny = mat->num_rows; 2289 } 2290 #endif 2291 } else { 2292 /* z = A^T x + beta y 2293 If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 2294 Note A^Tx is of full length, so we set beta to 1.0 if y exists. 2295 */ 2296 xptr = compressed ? 
cusparsestruct->workVector->data().get() : xarray; 2297 dptr = zarray; 2298 beta = yy ? matstruct->beta_one : matstruct->beta_zero; 2299 if (compressed) { /* Scatter x to work vector */ 2300 thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 2301 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 2302 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 2303 VecCUDAEqualsReverse()); 2304 } 2305 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2306 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2307 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2308 nx = mat->num_rows; 2309 ny = mat->num_cols; 2310 } 2311 #endif 2312 } 2313 2314 /* csr_spmv does y = alpha op(A) x + beta y */ 2315 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2316 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2317 if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 2318 if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 2319 stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 2320 stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 2321 stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, 2322 matstruct->matDescr, 2323 matstruct->cuSpMV[opA].vecXDescr, beta, 2324 matstruct->cuSpMV[opA].vecYDescr, 2325 cusparse_scalartype, 2326 cusparsestruct->spmvAlg, 2327 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat); 2328 cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr); 2329 2330 matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 2331 } else { 2332 /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 2333 stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat); 2334 stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat); 2335 } 2336 2337 stat = cusparseSpMV(cusparsestruct->handle, opA, 2338 matstruct->alpha_one, 2339 matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */ 2340 matstruct->cuSpMV[opA].vecXDescr, 2341 beta, 2342 matstruct->cuSpMV[opA].vecYDescr, 2343 cusparse_scalartype, 2344 cusparsestruct->spmvAlg, 2345 matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat); 2346 #else 2347 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2348 stat = cusparse_csr_spmv(cusparsestruct->handle, opA, 2349 mat->num_rows, mat->num_cols, 2350 mat->num_entries, matstruct->alpha_one, matstruct->descr, 2351 mat->values->data().get(), mat->row_offsets->data().get(), 2352 mat->column_indices->data().get(), xptr, beta, 2353 dptr);CHKERRCUSPARSE(stat); 2354 #endif 2355 } else { 2356 if (cusparsestruct->nrows) { 2357 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2358 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 2359 #else 2360 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 2361 stat = cusparse_hyb_spmv(cusparsestruct->handle, opA, 2362 matstruct->alpha_one, matstruct->descr, hybMat, 2363 xptr, beta, 
2364 dptr);CHKERRCUSPARSE(stat); 2365 #endif 2366 } 2367 } 2368 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2369 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2370 2371 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 2372 if (yy) { /* MatMultAdd: zz = A*xx + yy */ 2373 if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 2374 ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */ 2375 } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */ 2376 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 2377 } 2378 } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 2379 ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr); 2380 } 2381 2382 /* ScatterAdd the result from work vector into the full vector when A is compressed */ 2383 if (compressed) { 2384 thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 2385 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2386 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 2387 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 2388 VecCUDAPlusEquals()); 2389 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2390 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2391 } 2392 } else { 2393 if (yy && yy != zz) { 2394 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 2395 } 2396 } 2397 ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 2398 if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);} 2399 else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);} 2400 } catch(char *ex) { 2401 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 2402 } 2403 if (yy) { 2404 ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); 2405 } else { 2406 ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr); 2407 } 2408 PetscFunctionReturn(0); 2409 } 2410 2411 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2412 { 2413 PetscErrorCode ierr; 2414 2415 PetscFunctionBegin; 2416 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2417 PetscFunctionReturn(0); 2418 } 2419 2420 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 2421 { 2422 PetscErrorCode ierr; 2423 PetscSplitCSRDataStructure *d_mat = NULL, h_mat; 2424 PetscBool is_seq = PETSC_TRUE; 2425 PetscInt nnz_state = A->nonzerostate; 2426 PetscFunctionBegin; 2427 if (A->factortype == MAT_FACTOR_NONE) { 2428 d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat; 2429 } 2430 if (d_mat) { 2431 cudaError_t err; 2432 ierr = PetscInfo(A,"Assemble device matrix\n");CHKERRQ(ierr); 2433 err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err); 2434 nnz_state = h_mat.nonzerostate; 2435 is_seq = h_mat.seq; 2436 } 2437 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it? 
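  /* h_mat (read back above when a device split-CSR matrix exists) carries the GPU-side
     assembly state: its nonzerostate and seq flag decide below whether the data was
     assembled on the CPU and must be copied to the GPU, or was assembled directly on the
     GPU, in which case the device copy becomes the authoritative one. */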
2438 if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0); 2439 if (A->factortype == MAT_FACTOR_NONE && A->nonzerostate >= nnz_state && is_seq) { // assembled on CPU even though equipped for GPU 2440 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2441 } else if (nnz_state > A->nonzerostate) { 2442 A->offloadmask = PETSC_OFFLOAD_GPU; 2443 } 2444 2445 PetscFunctionReturn(0); 2446 } 2447 2448 /* --------------------------------------------------------------------------------*/ 2449 /*@ 2450 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 2451 (the default sequential PETSc format). This matrix will ultimately be pushed down 2452 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 2453 assembly performance the user should preallocate the matrix storage by setting 2454 the parameter nz (or the array nnz). By setting these parameters accurately, 2455 performance during matrix assembly can be increased by more than a factor of 50. 2456 2457 Collective 2458 2459 Input Parameters: 2460 + comm - MPI communicator, set to PETSC_COMM_SELF 2461 . m - number of rows 2462 . n - number of columns 2463 . nz - number of nonzeros per row (same for all rows) 2464 - nnz - array containing the number of nonzeros in the various rows 2465 (possibly different for each row) or NULL 2466 2467 Output Parameter: 2468 . A - the matrix 2469 2470 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 2471 MatXXXXSetPreallocation() paradigm instead of this routine directly. 2472 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 2473 2474 Notes: 2475 If nnz is given then nz is ignored 2476 2477 The AIJ format (also called the Yale sparse matrix format or 2478 compressed row storage), is fully compatible with standard Fortran 77 2479 storage. That is, the stored row and column indices can begin at 2480 either one (as in Fortran) or zero. See the users' manual for details. 2481 2482 Specify the preallocated storage with either nz or nnz (not both). 2483 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 2484 allocation. For large problems you MUST preallocate memory or you 2485 will get TERRIBLE performance, see the users' manual chapter on matrices. 2486 2487 By default, this format uses inodes (identical nodes) when possible, to 2488 improve numerical efficiency of matrix-vector products and solves. We 2489 search for consecutive rows with the same nonzero structure, thereby 2490 reusing matrix information to achieve increased efficiency. 
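   Example usage (a minimal sketch; the sizes and the per-row nonzero estimate are illustrative):
.vb
   Mat A;
   ierr = MatCreateSeqAIJCUSPARSE(PETSC_COMM_SELF,100,100,5,NULL,&A);CHKERRQ(ierr);
   /* ... insert entries with MatSetValues() ... */
   ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
   ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
   /* MatMult() and MatSolve() now run on the GPU through cuSPARSE */
   ierr = MatDestroy(&A);CHKERRQ(ierr);
.ve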
2491 2492 Level: intermediate 2493 2494 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 2495 @*/ 2496 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 2497 { 2498 PetscErrorCode ierr; 2499 2500 PetscFunctionBegin; 2501 ierr = MatCreate(comm,A);CHKERRQ(ierr); 2502 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 2503 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 2504 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 2505 PetscFunctionReturn(0); 2506 } 2507 2508 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 2509 { 2510 PetscErrorCode ierr; 2511 PetscSplitCSRDataStructure *d_mat = NULL; 2512 2513 PetscFunctionBegin; 2514 if (A->factortype == MAT_FACTOR_NONE) { 2515 d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat; 2516 ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL; 2517 ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr); 2518 } else { 2519 ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr); 2520 } 2521 if (d_mat) { 2522 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2523 cudaError_t err; 2524 PetscSplitCSRDataStructure h_mat; 2525 ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr); 2526 err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err); 2527 if (h_mat.seq) { 2528 if (a->compressedrow.use) { 2529 err = cudaFree(h_mat.diag.i);CHKERRCUDA(err); 2530 } 2531 err = cudaFree(d_mat);CHKERRCUDA(err); 2532 } 2533 } 2534 ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr); 2535 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 2536 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 2537 ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 2538 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 2539 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 2540 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 2541 PetscFunctionReturn(0); 2542 } 2543 2544 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*); 2545 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool); 2546 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B) 2547 { 2548 PetscErrorCode ierr; 2549 2550 PetscFunctionBegin; 2551 ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 2552 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr); 2553 PetscFunctionReturn(0); 2554 } 2555 2556 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg) 2557 { 2558 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2559 PetscErrorCode ierr; 2560 2561 PetscFunctionBegin; 2562 if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0); 2563 if (flg) { 2564 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 2565 2566 A->ops->mult = MatMult_SeqAIJ; 2567 A->ops->multadd = MatMultAdd_SeqAIJ; 2568 A->ops->multtranspose = MatMultTranspose_SeqAIJ; 2569 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 2570 A->ops->multhermitiantranspose = NULL; 2571 
    A->ops->multhermitiantransposeadd = NULL;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
  } else {
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  a->inode.use  = flg;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscSplitCSRDataStructure *d_mat = NULL;
  PetscErrorCode             ierr;
  PetscBool                  both = PETSC_FALSE;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (spptr->mat) {
      CsrMatrix *matrix = (CsrMatrix*)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE;
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
    if (spptr->matTranspose) {
      CsrMatrix *matrix = (CsrMatrix*)spptr->matTranspose->mat;
      if (matrix->values) {
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
    d_mat = spptr->deviceMat;
  }
  if (d_mat) {
    Mat_SeqAIJ  *a = (Mat_SeqAIJ*)A->data;
    PetscInt    n = A->rmap->n, nnz = a->i[n];
    cudaError_t err;
    PetscScalar *vals;
    ierr = PetscInfo(A,"Zero device matrix\n");CHKERRQ(ierr);
    err  = cudaMemcpy(&vals,&d_mat->diag.a,sizeof(PetscScalar*),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    err  = cudaMemset(vals,0,nnz*sizeof(PetscScalar));CHKERRCUDA(err);
  }
  ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;

  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat;

  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      spptr->format = MAT_CUSPARSE_CSR;
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
      spptr->deviceMat = NULL;
    } else {
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;
  B->ops->zeroentries    = MatZeroEntries_SeqAIJCUSPARSE;

  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr);
  ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in
   CSR, ELL, or HYB (hybrid) format. The ELL and HYB formats require CUDA 4.2 or
   later and are no longer supported as of CUDA 11. All matrix calculations are
   performed on NVIDIA GPUs using the CUSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult() and factors in MatSolve()) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult() only) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
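
   Example Usage:
   As a sketch, the type is typically selected at runtime, e.g.
.vb
      ./myapp -mat_type aijcusparse -mat_cusparse_storage_format csr
.ve
   (the executable name is illustrative only), or programmatically with
   MatSetType(A,MATSEQAIJCUSPARSE), optionally followed by MatCUSPARSESetFormat().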

   Level: beginner

.seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);

PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->cooPerm_v;
    delete (*cusparsestruct)->cooPerm_w;
    if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cudaError_t cerr = cudaFree((*cusparsestruct)->csr2cscBuffer);CHKERRCUDA(cerr);
#endif
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    delete (*mat)->values;
    delete (*mat)->column_indices;
    delete (*mat)->row_offsets;
    delete *mat;
    *mat = NULL;
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr)     {stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat);}
    if ((*trifactor)->solveInfo) {stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat);}
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    if ((*trifactor)->solveBuffer) {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
    if ((*trifactor)->AA_h)        {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
#endif
    ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
#endif
      } else {
        mat  = (CsrMatrix*)(*matstruct)->mat;
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) {stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat);}
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha_one) {err = cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err);}
    if ((*matstruct)->beta_zero) {err = cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err);}
    if ((*matstruct)->beta_one)  {err = cudaFree((*matstruct)->beta_one);CHKERRCUDA(err);}

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors **trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector   = NULL;
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    if ((handle = (*trifactors)->handle)) {
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    if (t1.get<0>() < t2.get<0>()) return true;
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
    return false;
  }
};

struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
    return true;
  }
};

struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return t1 == t2 ? 0 : 1;
  }
};

struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return t1||t2;
  }
};

#include <thrust/iterator/discard_iterator.h>
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *matrix;
  PetscErrorCode     ierr;
  cudaError_t        cerr;
  PetscInt           n;

  PetscFunctionBegin;
  if (!cusp)      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  n      = cusp->cooPerm->size();
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!cusp->cooPerm_v) cusp->cooPerm_v = new THRUSTARRAY(n);
  ierr = PetscLogEventBegin(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  if (v) {
    cusp->cooPerm_v->assign(v,v+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
  } else {
    thrust::fill(thrust::device,cusp->cooPerm_v->begin(),cusp->cooPerm_v->end(),0.);
  }
  if (imode == ADD_VALUES) {
    if (cusp->cooPerm_a) { /* repeated (i,j) entries: first sum the input values per unique nonzero, then add */
      if (!cusp->cooPerm_w) cusp->cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cusp->cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cusp->cooPerm_w->begin(),cusp->cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
    } else {
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals());
    }
  } else {
    if (cusp->cooPerm_a) { /* non-unique values insertion: the result is undefined if two different
                              values are inserted into the same location (we cannot guarantee that
                              the last one takes precedence) */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()),
                                                                thrust::make_permutation_iterator(matrix->values->begin(),cusp->cooPerm_a->begin())));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()),
                                                                thrust::make_permutation_iterator(matrix->values->begin(),cusp->cooPerm_a->end())));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    } else {
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    }
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  /* we can remove this call when MatSeqAIJGetArray operations are used everywhere! */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#include <thrust/binary_search.h>
PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *matrix;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0;
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(MAT_CUSPARSEPreallCOO,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    delete cusp->cooPerm;
    delete cusp->cooPerm_v;
    delete cusp->cooPerm_w;
    delete cusp->cooPerm_a;
    cusp->cooPerm   = NULL;
    cusp->cooPerm_v = NULL;
    cusp->cooPerm_w = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);

    if (!cusp->cooPerm)   cusp->cooPerm   = new THRUSTINTARRAY(n);
    if (!cusp->cooPerm_a) cusp->cooPerm_a = new THRUSTINTARRAY(n);

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());
    *cusp->cooPerm_a = d_i;
    THRUSTINTARRAY w = d_j;

    auto nekey = thrust::unique(fkey, ekey, IJEqual());
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* mark a 1 wherever the row or column index differs from its predecessor
                in the sorted input, then an inclusive scan maps each sorted entry
                to the index of the unique nonzero it contributes to */
      thrust::adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
      thrust::adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());
      (*cusp->cooPerm_a)[0] = 0;
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(),
                        search_begin, search_begin + A->rmap->n,
                        ii.begin());

    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0;
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!nnzr;
      a->ilen[i] = a->imax[i] = nnzr;
    }
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEPreallCOO,A,0,0,0);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now. The code is convoluted
     enough that we prefer to copy garbage values to the GPU rather than delay the
     allocation. */
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  {
    matrix = (CsrMatrix*)cusp->mat->mat;
    if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
    thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
  }

  A->offloadmask   = PETSC_OFFLOAD_CPU;
  A->assembled     = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
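
/* Usage sketch for the COO assembly path implemented above, driven through the
   public MatSetPreallocationCOO()/MatSetValuesCOO() interface (the indices and
   values below are illustrative only; with ADD_VALUES, repeated (i,j) pairs are
   summed into the same nonzero):

      PetscInt    coo_i[] = {0,0,1};
      PetscInt    coo_j[] = {0,1,1};
      PetscScalar coo_v[] = {1.0,2.0,3.0};

      ierr = MatSetPreallocationCOO(A,3,coo_i,coo_j);CHKERRQ(ierr);
      ierr = MatSetValuesCOO(A,coo_v,ADD_VALUES);CHKERRQ(ierr);

   Subsequent calls to MatSetValuesCOO() with new values reuse the permutation
   (cooPerm) and the duplicate map (cooPerm_a) computed here, so only the value
   array moves between host and device. */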