/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>

const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc., we list them in
   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
  } cusparseCsr2CscAlg_t;
*/
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG","CSRMV_ALG1","CSRMV_ALG2","cusparseSpMVAlg_t","CUSPARSE_",0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif

static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}
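/* Example (a hypothetical caller, not part of this file): handing a user-managed stream and
   handle to a MATSEQAIJCUSPARSE matrix A. Error checking abbreviated.

     cudaStream_t     stream;
     cusparseHandle_t handle;
     cerr = cudaStreamCreate(&stream);CHKERRCUDA(cerr);
     stat = cusparseCreate(&handle);CHKERRCUSPARSE(stat);
     ierr = MatCUSPARSESetHandle(A,handle);CHKERRQ(ierr);
     ierr = MatCUSPARSESetStream(A,stream);CHKERRQ(ierr);

   Note that MatCUSPARSESetHandle() destroys any handle the matrix already owns; a caller that
   keeps ownership of its handle should detach it with MatCUSPARSEClearHandle() (below) before
   the matrix is destroyed. */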
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          flg;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg || !cusparsestruct) PetscFunctionReturn(0);
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix solver type providing triangular solvers for sequential matrices
  on a single GPU of type seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  (*B)->useordering = PETSC_TRUE;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
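/* Example (a hypothetical application snippet): requesting this solver for an ILU
   preconditioner, either from the command line

     -mat_type seqaijcusparse -pc_type ilu -pc_factor_mat_solver_type cusparse

   or in code, assuming pc is the PC obtained from a KSP:

     ierr = PCSetType(pc,PCILU);CHKERRQ(ierr);
     ierr = PCFactorSetMatSolverType(pc,MATSOLVERCUSPARSE);CHKERRQ(ierr);
*/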
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
  MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
  operation. Only the MatMult operation can use different GPU storage formats
  for MPIAIJCUSPARSE matrices.

  Not Collective

  Input Parameters:
+ A - Matrix of type SEQAIJCUSPARSE
. op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
- format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

  Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
  MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose

  Collective on mat

  Input Parameters:
+ A - Matrix of type SEQAIJCUSPARSE
- transgen - the boolean flag

  Level: intermediate

.seealso: MATSEQAIJCUSPARSE
@*/
PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
{
  PetscErrorCode ierr;
  PetscBool      flg;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (flg) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
    cusp->transgen = transgen;
    if (!transgen) { /* need to destroy the transpose matrix if present, to prevent logic errors if transgen is set to true later */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscBool transgen = cusparsestruct->transgen;

    ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If the user set this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower = n+ai[n]-ai[1];
      cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!loTriFactor) {

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                                );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        /* Fill the lower triangular matrix */
        AALo[0] = 1.0;
        v       = aa;
        vi      = aj;
        offset  = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);
          offset      += nz;
          AALo[offset] = 1.0;
          offset      += 1;
          v           += nz;
        }
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
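/* Worked illustration of the layout built above (not code): for the 3x3 unit lower factor

       [ 1         ]
   L = [ l10  1    ]
       [ l20  l21 1 ]

   each row stores its strictly-lower entries followed by an explicit unit diagonal, so the
   host arrays become

     AiLo = {0, 1, 3, 6}                    row offsets
     AjLo = {0,  0, 1,  0, 1, 2}            column indices, diagonal last in each row
     AALo = {1., l10, 1., l20, l21, 1.}

   which is why nzLower = n + ai[n] - ai[1]: every row contributes its strictly-lower
   entries plus one unit diagonal. */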
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt) 0;
        AiUp[n] = nzUpper;
        offset  = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;
          vi = aj + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1./v[nz];
          AiUp[i]      = AiUp[i+1] - (nz+1);

          ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = n;
        upTriFactor->csrMat->num_cols    = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                                );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      } else {
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          AAUp[offset] = 1./v[nz];
          ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
        }
        upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
        ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
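/* Note on the layout built above: each row of U is stored with the *inverted* diagonal entry
   first (AAUp[offset] = 1./v[nz]) followed by the strictly-upper entries, matching the
   CUSPARSE_DIAG_TYPE_NON_UNIT descriptor. The rows are filled backwards (i = n-1 ... 0)
   because in the factored SeqAIJ layout the upper part of row i occupies positions
   adiag[i+1]+1 .. adiag[i]-1 with the diagonal at adiag[i], i.e. U is stored bottom-up. */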
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cusparseTriFactors->nnz = a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
    ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* upper triangular indices */
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
    ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
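/* Sketch (an assumption about how the permuted solve path consumes these arrays; bGPU is a
   thrust::device_ptr to the right-hand side and tempGPU is the work vector allocated above):
   rpermIndices/cpermIndices let the solve gather b into factored-row order on the device, e.g.

     thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
                  thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
                  tempGPU->begin());

   with the matching scatter through cpermIndices applied to the solution after the upper solve. */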
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt) 0;
        AiUp[n] = nzUpper;
        offset  = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = A->rmap->n;
        upTriFactor->csrMat->num_cols    = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                                );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation: the lower solve reuses the upper-triangular storage, applied transposed */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = A->rmap->n;
        loTriFactor->csrMat->num_cols    = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                                );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else {
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cusparseTriFactors->nnz = (a->nz-n)*2 + n;
  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip,*rip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
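/* Example (hypothetical standalone use; normally KSP/PC drives these steps): once A has type
   MATSEQAIJCUSPARSE, an ILU factorization and GPU triangular solve look like

     Mat           F;
     IS            rowperm,colperm;
     MatFactorInfo info;
     ierr = MatGetFactor(A,MATSOLVERCUSPARSE,MAT_FACTOR_ILU,&F);CHKERRQ(ierr);
     ierr = MatFactorInfoInitialize(&info);CHKERRQ(ierr);
     ierr = MatGetOrdering(A,MATORDERINGNATURAL,&rowperm,&colperm);CHKERRQ(ierr);
     ierr = MatILUFactorSymbolic(F,A,rowperm,colperm,&info);CHKERRQ(ierr);
     ierr = MatLUFactorNumeric(F,A,&info);CHKERRQ(ierr);
     ierr = MatSolve(F,b,x);CHKERRQ(ierr);

   With the natural (identity) ordering, the _NaturalOrdering solve kernels selected above are used. */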
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
               CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor */
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                       loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                       loTriFactor->csrMat->values->data().get(),
                                       loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(),
                                       loTriFactorT->csrMat->values->data().get(),
                                       loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
#else
                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
#endif
                         );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                           loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
#endif
                          );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
               CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor */
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                       upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                       upTriFactor->csrMat->values->data().get(),
                                       upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(),
                                       upTriFactorT->csrMat->values->data().get(),
                                       upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
#else
                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
#endif
                         );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                           upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
#endif
                          );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT     = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (!cusparsestruct->transgen || cusparsestruct->matTranspose) PetscFunctionReturn(0);
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* create cusparse matrix */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

  /* set alpha and beta */
  err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    matrixT->num_rows       = A->cmap->n;
    matrixT->num_cols       = A->rmap->n;
    matrixT->num_entries    = a->nz;
    matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values         = new THRUSTARRAY(a->nz);

    cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
    cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
the CSC */ 1229 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1230 stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, 1231 A->cmap->n, matrix->num_entries, 1232 matrix->values->data().get(), 1233 cusparsestruct->rowoffsets_gpu->data().get(), 1234 matrix->column_indices->data().get(), 1235 matrixT->values->data().get(), 1236 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1237 CUSPARSE_ACTION_NUMERIC,indexBase, 1238 cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1239 err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err); 1240 #endif 1241 1242 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1243 A->cmap->n, matrix->num_entries, 1244 matrix->values->data().get(), 1245 cusparsestruct->rowoffsets_gpu->data().get(), 1246 matrix->column_indices->data().get(), 1247 matrixT->values->data().get(), 1248 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1249 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1250 CUSPARSE_ACTION_NUMERIC,indexBase, 1251 cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer 1252 #else 1253 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1254 CUSPARSE_ACTION_NUMERIC, indexBase 1255 #endif 1256 );CHKERRCUSPARSE(stat); 1257 matstructT->mat = matrixT; 1258 1259 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1260 stat = cusparseCreateCsr(&matstructT->matDescr, 1261 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1262 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1263 matrixT->values->data().get(), 1264 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1265 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 1266 #endif 1267 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1268 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1269 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1270 #else 1271 CsrMatrix *temp = new CsrMatrix; 1272 CsrMatrix *tempT = new CsrMatrix; 1273 /* First convert HYB to CSR */ 1274 temp->num_rows = A->rmap->n; 1275 temp->num_cols = A->cmap->n; 1276 temp->num_entries = a->nz; 1277 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1278 temp->column_indices = new THRUSTINTARRAY32(a->nz); 1279 temp->values = new THRUSTARRAY(a->nz); 1280 1281 stat = cusparse_hyb2csr(cusparsestruct->handle, 1282 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 1283 temp->values->data().get(), 1284 temp->row_offsets->data().get(), 1285 temp->column_indices->data().get());CHKERRCUSPARSE(stat); 1286 1287 /* Next, convert CSR to CSC (i.e. 
the matrix transpose) */ 1288 tempT->num_rows = A->rmap->n; 1289 tempT->num_cols = A->cmap->n; 1290 tempT->num_entries = a->nz; 1291 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1292 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1293 tempT->values = new THRUSTARRAY(a->nz); 1294 1295 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1296 temp->num_cols, temp->num_entries, 1297 temp->values->data().get(), 1298 temp->row_offsets->data().get(), 1299 temp->column_indices->data().get(), 1300 tempT->values->data().get(), 1301 tempT->column_indices->data().get(), 1302 tempT->row_offsets->data().get(), 1303 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1304 1305 /* Last, convert CSC to HYB */ 1306 cusparseHybMat_t hybMat; 1307 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1308 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1309 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1310 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1311 matstructT->descr, tempT->values->data().get(), 1312 tempT->row_offsets->data().get(), 1313 tempT->column_indices->data().get(), 1314 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1315 1316 /* assign the pointer */ 1317 matstructT->mat = hybMat; 1318 /* delete temporaries */ 1319 if (tempT) { 1320 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1321 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1322 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1323 delete (CsrMatrix*) tempT; 1324 } 1325 if (temp) { 1326 if (temp->values) delete (THRUSTARRAY*) temp->values; 1327 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1328 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1329 delete (CsrMatrix*) temp; 1330 } 1331 #endif 1332 } 1333 err = WaitForCUDA();CHKERRCUDA(err); 1334 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1335 ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1336 /* the compressed row indices is not used for matTranspose */ 1337 matstructT->cprowIndices = NULL; 1338 /* assign the pointer */ 1339 ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 1340 PetscFunctionReturn(0); 1341 } 1342 1343 /* Why do we need to analyze the tranposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 1344 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1345 { 1346 PetscInt n = xx->map->n; 1347 const PetscScalar *barray; 1348 PetscScalar *xarray; 1349 thrust::device_ptr<const PetscScalar> bGPU; 1350 thrust::device_ptr<PetscScalar> xGPU; 1351 cusparseStatus_t stat; 1352 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1353 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1354 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1355 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1356 PetscErrorCode ierr; 1357 cudaError_t cerr; 1358 1359 PetscFunctionBegin; 1360 /* Analyze the matrix and create the transpose ... 
on the fly */ 1361 if (!loTriFactorT && !upTriFactorT) { 1362 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1363 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1364 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1365 } 1366 1367 /* Get the GPU pointers */ 1368 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1369 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1370 xGPU = thrust::device_pointer_cast(xarray); 1371 bGPU = thrust::device_pointer_cast(barray); 1372 1373 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1374 /* First, reorder with the row permutation */ 1375 thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1376 thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 1377 xGPU); 1378 1379 /* First, solve U */ 1380 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1381 upTriFactorT->csrMat->num_rows, 1382 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1383 upTriFactorT->csrMat->num_entries, 1384 #endif 1385 &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1386 upTriFactorT->csrMat->values->data().get(), 1387 upTriFactorT->csrMat->row_offsets->data().get(), 1388 upTriFactorT->csrMat->column_indices->data().get(), 1389 upTriFactorT->solveInfo, 1390 xarray, tempGPU->data().get() 1391 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1392 ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1393 #endif 1394 );CHKERRCUSPARSE(stat); 1395 1396 /* Then, solve L */ 1397 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1398 loTriFactorT->csrMat->num_rows, 1399 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1400 loTriFactorT->csrMat->num_entries, 1401 #endif 1402 &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1403 loTriFactorT->csrMat->values->data().get(), 1404 loTriFactorT->csrMat->row_offsets->data().get(), 1405 loTriFactorT->csrMat->column_indices->data().get(), 1406 loTriFactorT->solveInfo, 1407 tempGPU->data().get(), xarray 1408 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1409 ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1410 #endif 1411 );CHKERRCUSPARSE(stat); 1412 1413 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1414 thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1415 thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1416 tempGPU->begin()); 1417 1418 /* Copy the temporary to the full solution. 
*/ 1419 thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU); 1420 1421 /* restore */ 1422 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1423 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1424 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1425 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1426 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1427 PetscFunctionReturn(0); 1428 } 1429 1430 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1431 { 1432 const PetscScalar *barray; 1433 PetscScalar *xarray; 1434 cusparseStatus_t stat; 1435 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1436 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1437 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1438 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1439 PetscErrorCode ierr; 1440 cudaError_t cerr; 1441 1442 PetscFunctionBegin; 1443 /* Analyze the matrix and create the transpose ... on the fly */ 1444 if (!loTriFactorT && !upTriFactorT) { 1445 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1446 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1447 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1448 } 1449 1450 /* Get the GPU pointers */ 1451 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1452 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1453 1454 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1455 /* First, solve U */ 1456 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1457 upTriFactorT->csrMat->num_rows, 1458 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1459 upTriFactorT->csrMat->num_entries, 1460 #endif 1461 &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1462 upTriFactorT->csrMat->values->data().get(), 1463 upTriFactorT->csrMat->row_offsets->data().get(), 1464 upTriFactorT->csrMat->column_indices->data().get(), 1465 upTriFactorT->solveInfo, 1466 barray, tempGPU->data().get() 1467 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1468 ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1469 #endif 1470 );CHKERRCUSPARSE(stat); 1471 1472 /* Then, solve L */ 1473 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1474 loTriFactorT->csrMat->num_rows, 1475 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1476 loTriFactorT->csrMat->num_entries, 1477 #endif 1478 &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1479 loTriFactorT->csrMat->values->data().get(), 1480 loTriFactorT->csrMat->row_offsets->data().get(), 1481 loTriFactorT->csrMat->column_indices->data().get(), 1482 loTriFactorT->solveInfo, 1483 tempGPU->data().get(), xarray 1484 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1485 ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1486 #endif 1487 );CHKERRCUSPARSE(stat); 1488 1489 /* restore */ 1490 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1491 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1492 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1493 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1494 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1495 PetscFunctionReturn(0); 1496 } 1497 1498 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1499 { 1500 const PetscScalar *barray; 1501 PetscScalar 
*xarray; 1502 thrust::device_ptr<const PetscScalar> bGPU; 1503 thrust::device_ptr<PetscScalar> xGPU; 1504 cusparseStatus_t stat; 1505 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1506 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1507 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1508 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1509 PetscErrorCode ierr; 1510 cudaError_t cerr; 1511 1512 PetscFunctionBegin; 1513 1514 /* Get the GPU pointers */ 1515 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1516 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1517 xGPU = thrust::device_pointer_cast(xarray); 1518 bGPU = thrust::device_pointer_cast(barray); 1519 1520 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1521 /* First, reorder with the row permutation */ 1522 thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1523 thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 1524 tempGPU->begin()); 1525 1526 /* Next, solve L */ 1527 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1528 loTriFactor->csrMat->num_rows, 1529 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1530 loTriFactor->csrMat->num_entries, 1531 #endif 1532 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1533 loTriFactor->csrMat->values->data().get(), 1534 loTriFactor->csrMat->row_offsets->data().get(), 1535 loTriFactor->csrMat->column_indices->data().get(), 1536 loTriFactor->solveInfo, 1537 tempGPU->data().get(), xarray 1538 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1539 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 1540 #endif 1541 );CHKERRCUSPARSE(stat); 1542 1543 /* Then, solve U */ 1544 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1545 upTriFactor->csrMat->num_rows, 1546 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1547 upTriFactor->csrMat->num_entries, 1548 #endif 1549 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1550 upTriFactor->csrMat->values->data().get(), 1551 upTriFactor->csrMat->row_offsets->data().get(), 1552 upTriFactor->csrMat->column_indices->data().get(), 1553 upTriFactor->solveInfo, 1554 xarray, tempGPU->data().get() 1555 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1556 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 1557 #endif 1558 );CHKERRCUSPARSE(stat); 1559 1560 /* Last, reorder with the column permutation */ 1561 thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 1562 thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), 1563 xGPU); 1564 1565 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1566 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1567 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1568 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1569 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1570 PetscFunctionReturn(0); 1571 } 1572 1573 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1574 { 1575 const PetscScalar *barray; 1576 PetscScalar *xarray; 1577 cusparseStatus_t stat; 1578 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1579 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1580 
Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1581 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1582 PetscErrorCode ierr; 1583 cudaError_t cerr; 1584 1585 PetscFunctionBegin; 1586 /* Get the GPU pointers */ 1587 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1588 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1589 1590 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1591 /* First, solve L */ 1592 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1593 loTriFactor->csrMat->num_rows, 1594 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1595 loTriFactor->csrMat->num_entries, 1596 #endif 1597 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1598 loTriFactor->csrMat->values->data().get(), 1599 loTriFactor->csrMat->row_offsets->data().get(), 1600 loTriFactor->csrMat->column_indices->data().get(), 1601 loTriFactor->solveInfo, 1602 barray, tempGPU->data().get() 1603 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1604 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 1605 #endif 1606 );CHKERRCUSPARSE(stat); 1607 1608 /* Next, solve U */ 1609 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1610 upTriFactor->csrMat->num_rows, 1611 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1612 upTriFactor->csrMat->num_entries, 1613 #endif 1614 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1615 upTriFactor->csrMat->values->data().get(), 1616 upTriFactor->csrMat->row_offsets->data().get(), 1617 upTriFactor->csrMat->column_indices->data().get(), 1618 upTriFactor->solveInfo, 1619 tempGPU->data().get(), xarray 1620 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1621 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 1622 #endif 1623 );CHKERRCUSPARSE(stat); 1624 1625 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1626 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1627 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1628 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1629 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1630 PetscFunctionReturn(0); 1631 } 1632 1633 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) 1634 { 1635 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1636 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1637 cudaError_t cerr; 1638 PetscErrorCode ierr; 1639 1640 PetscFunctionBegin; 1641 if (A->offloadmask == PETSC_OFFLOAD_GPU) { 1642 CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat; 1643 1644 ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 1645 cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 1646 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1647 ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr); 1648 ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 1649 A->offloadmask = PETSC_OFFLOAD_BOTH; 1650 } 1651 PetscFunctionReturn(0); 1652 } 1653 1654 static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 1655 { 1656 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1657 PetscErrorCode ierr; 1658 1659 PetscFunctionBegin; 1660 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 1661 *array = a->a; 1662 A->offloadmask = PETSC_OFFLOAD_CPU; 1663 PetscFunctionReturn(0); 1664 } 1665 1666 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 1667 { 1668 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1669 Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 1670 Mat_SeqAIJ *a = 
(Mat_SeqAIJ*)A->data; 1671 PetscInt m = A->rmap->n,*ii,*ridx,tmp; 1672 PetscErrorCode ierr; 1673 cusparseStatus_t stat; 1674 cudaError_t err; 1675 1676 PetscFunctionBegin; 1677 if (A->boundtocpu) PetscFunctionReturn(0); 1678 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 1679 if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { 1680 /* Copy values only */ 1681 CsrMatrix *matrix,*matrixT; 1682 matrix = (CsrMatrix*)cusparsestruct->mat->mat; 1683 1684 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1685 matrix->values->assign(a->a, a->a+a->nz); 1686 err = WaitForCUDA();CHKERRCUDA(err); 1687 ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 1688 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1689 1690 /* Update matT when it was built before */ 1691 if (cusparsestruct->matTranspose) { 1692 cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr); 1693 matrixT = (CsrMatrix*)cusparsestruct->matTranspose->mat; 1694 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1695 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1696 A->cmap->n, matrix->num_entries, 1697 matrix->values->data().get(), 1698 cusparsestruct->rowoffsets_gpu->data().get(), 1699 matrix->column_indices->data().get(), 1700 matrixT->values->data().get(), 1701 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1702 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1703 CUSPARSE_ACTION_NUMERIC,indexBase, 1704 cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer 1705 #else 1706 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1707 CUSPARSE_ACTION_NUMERIC, indexBase 1708 #endif 1709 );CHKERRCUSPARSE(stat); 1710 err = WaitForCUDA();CHKERRCUDA(err); 1711 ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1712 } 1713 } else { 1714 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1715 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr); 1716 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr); 1717 delete cusparsestruct->workVector; 1718 delete cusparsestruct->rowoffsets_gpu; 1719 try { 1720 if (a->compressedrow.use) { 1721 m = a->compressedrow.nrows; 1722 ii = a->compressedrow.i; 1723 ridx = a->compressedrow.rindex; 1724 } else { 1725 m = A->rmap->n; 1726 ii = a->i; 1727 ridx = NULL; 1728 } 1729 cusparsestruct->nrows = m; 1730 1731 /* create cusparse matrix */ 1732 matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 1733 stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat); 1734 stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 1735 stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1736 1737 err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 1738 err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1739 err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1740 err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1741 err = 
cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1742 err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1743 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 1744 1745 /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1746 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1747 /* set the matrix */ 1748 CsrMatrix *mat= new CsrMatrix; 1749 mat->num_rows = m; 1750 mat->num_cols = A->cmap->n; 1751 mat->num_entries = a->nz; 1752 mat->row_offsets = new THRUSTINTARRAY32(m+1); 1753 mat->row_offsets->assign(ii, ii + m+1); 1754 1755 mat->column_indices = new THRUSTINTARRAY32(a->nz); 1756 mat->column_indices->assign(a->j, a->j+a->nz); 1757 1758 mat->values = new THRUSTARRAY(a->nz); 1759 mat->values->assign(a->a, a->a+a->nz); 1760 1761 /* assign the pointer */ 1762 matstruct->mat = mat; 1763 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1764 if (mat->num_rows) { /* cusparse errors on empty matrices! */ 1765 stat = cusparseCreateCsr(&matstruct->matDescr, 1766 mat->num_rows, mat->num_cols, mat->num_entries, 1767 mat->row_offsets->data().get(), mat->column_indices->data().get(), 1768 mat->values->data().get(), 1769 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 1770 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 1771 } 1772 #endif 1773 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1774 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1775 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1776 #else 1777 CsrMatrix *mat= new CsrMatrix; 1778 mat->num_rows = m; 1779 mat->num_cols = A->cmap->n; 1780 mat->num_entries = a->nz; 1781 mat->row_offsets = new THRUSTINTARRAY32(m+1); 1782 mat->row_offsets->assign(ii, ii + m+1); 1783 1784 mat->column_indices = new THRUSTINTARRAY32(a->nz); 1785 mat->column_indices->assign(a->j, a->j+a->nz); 1786 1787 mat->values = new THRUSTARRAY(a->nz); 1788 mat->values->assign(a->a, a->a+a->nz); 1789 1790 cusparseHybMat_t hybMat; 1791 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1792 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 
1793 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1794 stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, 1795 matstruct->descr, mat->values->data().get(), 1796 mat->row_offsets->data().get(), 1797 mat->column_indices->data().get(), 1798 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1799 /* assign the pointer */ 1800 matstruct->mat = hybMat; 1801 1802 if (mat) { 1803 if (mat->values) delete (THRUSTARRAY*)mat->values; 1804 if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices; 1805 if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets; 1806 delete (CsrMatrix*)mat; 1807 } 1808 #endif 1809 } 1810 1811 /* assign the compressed row indices */ 1812 if (a->compressedrow.use) { 1813 cusparsestruct->workVector = new THRUSTARRAY(m); 1814 matstruct->cprowIndices = new THRUSTINTARRAY(m); 1815 matstruct->cprowIndices->assign(ridx,ridx+m); 1816 tmp = m; 1817 } else { 1818 cusparsestruct->workVector = NULL; 1819 matstruct->cprowIndices = NULL; 1820 tmp = 0; 1821 } 1822 ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr); 1823 1824 /* assign the pointer */ 1825 cusparsestruct->mat = matstruct; 1826 } catch(char *ex) { 1827 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1828 } 1829 err = WaitForCUDA();CHKERRCUDA(err); 1830 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1831 cusparsestruct->nonzerostate = A->nonzerostate; 1832 } 1833 A->offloadmask = PETSC_OFFLOAD_BOTH; 1834 } 1835 PetscFunctionReturn(0); 1836 } 1837 1838 struct VecCUDAPlusEquals 1839 { 1840 template <typename Tuple> 1841 __host__ __device__ 1842 void operator()(Tuple t) 1843 { 1844 thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 1845 } 1846 }; 1847 1848 struct VecCUDAEquals 1849 { 1850 template <typename Tuple> 1851 __host__ __device__ 1852 void operator()(Tuple t) 1853 { 1854 thrust::get<1>(t) = thrust::get<0>(t); 1855 } 1856 }; 1857 1858 struct VecCUDAEqualsReverse 1859 { 1860 template <typename Tuple> 1861 __host__ __device__ 1862 void operator()(Tuple t) 1863 { 1864 thrust::get<0>(t) = thrust::get<1>(t); 1865 } 1866 }; 1867 1868 struct MatMatCusparse { 1869 PetscBool cisdense; 1870 PetscScalar *Bt; 1871 Mat X; 1872 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1873 PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 1874 cusparseDnMatDescr_t matBDescr; 1875 cusparseDnMatDescr_t matCDescr; 1876 size_t spmmBufferSize; 1877 void *spmmBuffer; 1878 PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/ 1879 #endif 1880 }; 1881 1882 static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 1883 { 1884 PetscErrorCode ierr; 1885 MatMatCusparse *mmdata = (MatMatCusparse *)data; 1886 cudaError_t cerr; 1887 1888 PetscFunctionBegin; 1889 cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr); 1890 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1891 cusparseStatus_t stat; 1892 if (mmdata->matBDescr) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat);} 1893 if (mmdata->matCDescr) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat);} 1894 if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);} 1895 #endif 1896 ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr); 1897 ierr = PetscFree(data);CHKERRQ(ierr); 1898 PetscFunctionReturn(0); 1899 } 1900 1901 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool); 1902 1903 static PetscErrorCode 
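/* Numeric phase of the products between a MATSEQAIJCUSPARSE matrix A and a dense (CUDA) matrix B.
   For AB, AtB and ABt the result C is computed directly with cusparseSpMM() (CUDA >= 11) or the legacy
   csrmm routine; for PtAP and RARt the sparse-dense product A*B (resp. A*B^T) is first written to the
   intermediate dense matrix mmdata->X and the remaining dense-dense product is then performed by
   MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(). */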
MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 1904 { 1905 Mat_Product *product = C->product; 1906 Mat A,B; 1907 PetscInt m,n,blda,clda; 1908 PetscBool flg,biscuda; 1909 Mat_SeqAIJCUSPARSE *cusp; 1910 cusparseStatus_t stat; 1911 cusparseOperation_t opA; 1912 const PetscScalar *barray; 1913 PetscScalar *carray; 1914 PetscErrorCode ierr; 1915 MatMatCusparse *mmdata; 1916 Mat_SeqAIJCUSPARSEMultStruct *mat; 1917 CsrMatrix *csrmat; 1918 cudaError_t cerr; 1919 1920 PetscFunctionBegin; 1921 MatCheckProduct(C,1); 1922 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 1923 mmdata = (MatMatCusparse*)product->data; 1924 A = product->A; 1925 B = product->B; 1926 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 1927 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 1928 /* currently CopyToGpu does not copy if the matrix is bound to CPU 1929 Instead of silently accepting the wrong answer, I prefer to raise the error */ 1930 if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 1931 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1932 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1933 switch (product->type) { 1934 case MATPRODUCT_AB: 1935 case MATPRODUCT_PtAP: 1936 mat = cusp->mat; 1937 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1938 m = A->rmap->n; 1939 n = B->cmap->n; 1940 break; 1941 case MATPRODUCT_AtB: 1942 if (!cusp->transgen) { 1943 mat = cusp->mat; 1944 opA = CUSPARSE_OPERATION_TRANSPOSE; 1945 } else { 1946 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1947 mat = cusp->matTranspose; 1948 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1949 } 1950 m = A->cmap->n; 1951 n = B->cmap->n; 1952 break; 1953 case MATPRODUCT_ABt: 1954 case MATPRODUCT_RARt: 1955 mat = cusp->mat; 1956 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1957 m = A->rmap->n; 1958 n = B->rmap->n; 1959 break; 1960 default: 1961 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 1962 } 1963 if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 1964 csrmat = (CsrMatrix*)mat->mat; 1965 /* if the user passed a CPU matrix, copy the data to the GPU */ 1966 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr); 1967 if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);} 1968 ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr); 1969 1970 ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr); 1971 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 1972 ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 1973 ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr); 1974 } else { 1975 ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr); 1976 ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr); 1977 } 1978 1979 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1980 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1981 cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? 
CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 1982 /* (re)allcoate spmmBuffer if not initialized or LDAs are different */ 1983 if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 1984 if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;} 1985 if (!mmdata->matBDescr) { 1986 stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 1987 mmdata->Blda = blda; 1988 } 1989 1990 if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;} 1991 if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 1992 stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 1993 mmdata->Clda = clda; 1994 } 1995 1996 if (!mat->matDescr) { 1997 stat = cusparseCreateCsr(&mat->matDescr, 1998 csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, 1999 csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), 2000 csrmat->values->data().get(), 2001 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2002 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 2003 } 2004 stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one, 2005 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2006 mmdata->matCDescr,cusparse_scalartype, 2007 cusp->spmmAlg,&mmdata->spmmBufferSize);CHKERRCUSPARSE(stat); 2008 if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);} 2009 cerr = cudaMalloc(&mmdata->spmmBuffer,mmdata->spmmBufferSize);CHKERRCUDA(cerr); 2010 mmdata->initialized = PETSC_TRUE; 2011 } else { 2012 /* to be safe, always update pointers of the mats */ 2013 stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat); 2014 stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat); 2015 stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat); 2016 } 2017 2018 /* do cusparseSpMM, which supports transpose on B */ 2019 stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one, 2020 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2021 mmdata->matCDescr,cusparse_scalartype, 2022 cusp->spmmAlg,mmdata->spmmBuffer);CHKERRCUSPARSE(stat); 2023 #else 2024 PetscInt k; 2025 /* cusparseXcsrmm does not support transpose on B */ 2026 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2027 cublasHandle_t cublasv2handle; 2028 cublasStatus_t cerr; 2029 2030 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 2031 cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T, 2032 B->cmap->n,B->rmap->n, 2033 &PETSC_CUSPARSE_ONE ,barray,blda, 2034 &PETSC_CUSPARSE_ZERO,barray,blda, 2035 mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr); 2036 blda = B->cmap->n; 2037 k = B->cmap->n; 2038 } else { 2039 k = B->rmap->n; 2040 } 2041 2042 /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 2043 stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k, 2044 csrmat->num_entries,mat->alpha_one,mat->descr, 2045 csrmat->values->data().get(), 2046 csrmat->row_offsets->data().get(), 2047 csrmat->column_indices->data().get(), 2048 mmdata->Bt ? 
mmdata->Bt : barray,blda,mat->beta_zero, 2049 carray,clda);CHKERRCUSPARSE(stat); 2050 #endif 2051 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2052 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2053 ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr); 2054 ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr); 2055 if (product->type == MATPRODUCT_RARt) { 2056 ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2057 ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2058 } else if (product->type == MATPRODUCT_PtAP) { 2059 ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2060 ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2061 } else { 2062 ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr); 2063 } 2064 if (mmdata->cisdense) { 2065 ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr); 2066 } 2067 if (!biscuda) { 2068 ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 2069 } 2070 PetscFunctionReturn(0); 2071 } 2072 2073 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2074 { 2075 Mat_Product *product = C->product; 2076 Mat A,B; 2077 PetscInt m,n; 2078 PetscBool cisdense,flg; 2079 PetscErrorCode ierr; 2080 MatMatCusparse *mmdata; 2081 Mat_SeqAIJCUSPARSE *cusp; 2082 2083 PetscFunctionBegin; 2084 MatCheckProduct(C,1); 2085 if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty"); 2086 A = product->A; 2087 B = product->B; 2088 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2089 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 2090 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2091 if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2092 switch (product->type) { 2093 case MATPRODUCT_AB: 2094 m = A->rmap->n; 2095 n = B->cmap->n; 2096 break; 2097 case MATPRODUCT_AtB: 2098 m = A->cmap->n; 2099 n = B->cmap->n; 2100 break; 2101 case MATPRODUCT_ABt: 2102 m = A->rmap->n; 2103 n = B->rmap->n; 2104 break; 2105 case MATPRODUCT_PtAP: 2106 m = B->cmap->n; 2107 n = B->cmap->n; 2108 break; 2109 case MATPRODUCT_RARt: 2110 m = B->rmap->n; 2111 n = B->rmap->n; 2112 break; 2113 default: 2114 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 2115 } 2116 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2117 /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */ 2118 ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr); 2119 ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr); 2120 2121 /* product data */ 2122 ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2123 mmdata->cisdense = cisdense; 2124 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0) 2125 /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */ 2126 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2127 cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr); 2128 } 2129 #endif 2130 /* for these products we need intermediate storage */ 2131 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2132 ierr = 
MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr); 2133 ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr); 2134 if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */ 2135 ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr); 2136 } else { 2137 ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr); 2138 } 2139 } 2140 C->product->data = mmdata; 2141 C->product->destroy = MatDestroy_MatMatCusparse; 2142 2143 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 2144 PetscFunctionReturn(0); 2145 } 2146 2147 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 2148 2149 /* handles dense B */ 2150 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C) 2151 { 2152 Mat_Product *product = C->product; 2153 PetscErrorCode ierr; 2154 2155 PetscFunctionBegin; 2156 MatCheckProduct(C,1); 2157 if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A"); 2158 if (product->A->boundtocpu) { 2159 ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(C);CHKERRQ(ierr); 2160 PetscFunctionReturn(0); 2161 } 2162 switch (product->type) { 2163 case MATPRODUCT_AB: 2164 case MATPRODUCT_AtB: 2165 case MATPRODUCT_ABt: 2166 case MATPRODUCT_PtAP: 2167 case MATPRODUCT_RARt: 2168 C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 2169 default: 2170 break; 2171 } 2172 PetscFunctionReturn(0); 2173 } 2174 2175 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2176 { 2177 PetscErrorCode ierr; 2178 2179 PetscFunctionBegin; 2180 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2181 PetscFunctionReturn(0); 2182 } 2183 2184 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz) 2185 { 2186 PetscErrorCode ierr; 2187 2188 PetscFunctionBegin; 2189 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2190 PetscFunctionReturn(0); 2191 } 2192 2193 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2194 { 2195 PetscErrorCode ierr; 2196 2197 PetscFunctionBegin; 2198 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 2199 PetscFunctionReturn(0); 2200 } 2201 2202 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2203 { 2204 PetscErrorCode ierr; 2205 2206 PetscFunctionBegin; 2207 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 2208 PetscFunctionReturn(0); 2209 } 2210 2211 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2212 { 2213 PetscErrorCode ierr; 2214 2215 PetscFunctionBegin; 2216 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2217 PetscFunctionReturn(0); 2218 } 2219 2220 /* z = op(A) x + y. 
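   Note y may be NULL (plain MatMult) or alias z (MatMultAdd called with yy == zz); both cases are handled below.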
If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 2221 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm) 2222 { 2223 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2224 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 2225 Mat_SeqAIJCUSPARSEMultStruct *matstruct; 2226 PetscScalar *xarray,*zarray,*dptr,*beta,*xptr; 2227 PetscErrorCode ierr; 2228 cudaError_t cerr; 2229 cusparseStatus_t stat; 2230 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2231 PetscBool compressed; 2232 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2233 PetscInt nx,ny; 2234 #endif 2235 2236 PetscFunctionBegin; 2237 if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported"); 2238 if (!a->nonzerorowcnt) { 2239 if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);} 2240 else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);} 2241 PetscFunctionReturn(0); 2242 } 2243 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 2244 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2245 if (!trans) { 2246 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 2247 if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 2248 } else { 2249 if (herm || !cusparsestruct->transgen) { 2250 opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 2251 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 2252 } else { 2253 if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);} 2254 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 2255 } 2256 } 2257 /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 2258 compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE; 2259 2260 try { 2261 ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 2262 if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */ 2263 else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */ 2264 2265 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2266 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 2267 /* z = A x + beta y. 2268 If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 2269 When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 2270 */ 2271 xptr = xarray; 2272 dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 2273 beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 2274 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2275 /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 2276 allocated to accommodate different uses. So we get the length info directly from mat. 2277 */ 2278 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2279 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2280 nx = mat->num_cols; 2281 ny = mat->num_rows; 2282 } 2283 #endif 2284 } else { 2285 /* z = A^T x + beta y 2286 If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 2287 Note A^Tx is of full length, so we set beta to 1.0 if y exists. 2288 */ 2289 xptr = compressed ? 
cusparsestruct->workVector->data().get() : xarray; 2290 dptr = zarray; 2291 beta = yy ? matstruct->beta_one : matstruct->beta_zero; 2292 if (compressed) { /* Scatter x to work vector */ 2293 thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 2294 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 2295 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 2296 VecCUDAEqualsReverse()); 2297 } 2298 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2299 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2300 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2301 nx = mat->num_rows; 2302 ny = mat->num_cols; 2303 } 2304 #endif 2305 } 2306 2307 /* csr_spmv does y = alpha op(A) x + beta y */ 2308 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2309 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2310 if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 2311 if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 2312 stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 2313 stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 2314 stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, 2315 matstruct->matDescr, 2316 matstruct->cuSpMV[opA].vecXDescr, beta, 2317 matstruct->cuSpMV[opA].vecYDescr, 2318 cusparse_scalartype, 2319 cusparsestruct->spmvAlg, 2320 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat); 2321 cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr); 2322 2323 matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 2324 } else { 2325 /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 2326 stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat); 2327 stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat); 2328 } 2329 2330 stat = cusparseSpMV(cusparsestruct->handle, opA, 2331 matstruct->alpha_one, 2332 matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */ 2333 matstruct->cuSpMV[opA].vecXDescr, 2334 beta, 2335 matstruct->cuSpMV[opA].vecYDescr, 2336 cusparse_scalartype, 2337 cusparsestruct->spmvAlg, 2338 matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat); 2339 #else 2340 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2341 stat = cusparse_csr_spmv(cusparsestruct->handle, opA, 2342 mat->num_rows, mat->num_cols, 2343 mat->num_entries, matstruct->alpha_one, matstruct->descr, 2344 mat->values->data().get(), mat->row_offsets->data().get(), 2345 mat->column_indices->data().get(), xptr, beta, 2346 dptr);CHKERRCUSPARSE(stat); 2347 #endif 2348 } else { 2349 if (cusparsestruct->nrows) { 2350 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2351 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 2352 #else 2353 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 2354 stat = cusparse_hyb_spmv(cusparsestruct->handle, opA, 2355 matstruct->alpha_one, matstruct->descr, hybMat, 2356 xptr, beta, 
2357 dptr);CHKERRCUSPARSE(stat); 2358 #endif 2359 } 2360 } 2361 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2362 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2363 2364 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 2365 if (yy) { /* MatMultAdd: zz = A*xx + yy */ 2366 if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 2367 ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */ 2368 } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */ 2369 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 2370 } 2371 } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 2372 ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr); 2373 } 2374 2375 /* ScatterAdd the result from work vector into the full vector when A is compressed */ 2376 if (compressed) { 2377 thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 2378 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2379 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 2380 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 2381 VecCUDAPlusEquals()); 2382 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2383 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2384 } 2385 } else { 2386 if (yy && yy != zz) { 2387 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 2388 } 2389 } 2390 ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 2391 if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);} 2392 else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);} 2393 } catch(char *ex) { 2394 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 2395 } 2396 if (yy) { 2397 ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); 2398 } else { 2399 ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr); 2400 } 2401 PetscFunctionReturn(0); 2402 } 2403 2404 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2405 { 2406 PetscErrorCode ierr; 2407 2408 PetscFunctionBegin; 2409 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2410 PetscFunctionReturn(0); 2411 } 2412 2413 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 2414 { 2415 PetscErrorCode ierr; 2416 PetscSplitCSRDataStructure *d_mat = NULL, h_mat; 2417 PetscBool is_seq = PETSC_TRUE; 2418 PetscInt nnz_state = A->nonzerostate; 2419 PetscFunctionBegin; 2420 if (A->factortype == MAT_FACTOR_NONE) { 2421 d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat; 2422 } 2423 if (d_mat) { 2424 cudaError_t err; 2425 ierr = PetscInfo(A,"Assemble device matrix\n");CHKERRQ(ierr); 2426 err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err); 2427 nnz_state = h_mat.nonzerostate; 2428 is_seq = h_mat.seq; 2429 } 2430 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it? 
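  /* Flush assemblies and CPU-bound matrices need no further action. Otherwise, for unfactored matrices,
     values assembled on the host (host nonzerostate not behind the device one) are pushed to the GPU,
     while a device-side assembly that advanced the nonzerostate simply marks the GPU copy as the
     up-to-date one. */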
2431 if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0); 2432 if (A->factortype == MAT_FACTOR_NONE && A->nonzerostate >= nnz_state && is_seq) { // assembled on CPU even though equipped for GPU 2433 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2434 } else if (nnz_state > A->nonzerostate) { 2435 A->offloadmask = PETSC_OFFLOAD_GPU; 2436 } 2437 2438 PetscFunctionReturn(0); 2439 } 2440 2441 /* --------------------------------------------------------------------------------*/ 2442 /*@ 2443 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 2444 (the default parallel PETSc format). This matrix will ultimately be pushed down 2445 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 2446 assembly performance the user should preallocate the matrix storage by setting 2447 the parameter nz (or the array nnz). By setting these parameters accurately, 2448 performance during matrix assembly can be increased by more than a factor of 50. 2449 2450 Collective 2451 2452 Input Parameters: 2453 + comm - MPI communicator, set to PETSC_COMM_SELF 2454 . m - number of rows 2455 . n - number of columns 2456 . nz - number of nonzeros per row (same for all rows) 2457 - nnz - array containing the number of nonzeros in the various rows 2458 (possibly different for each row) or NULL 2459 2460 Output Parameter: 2461 . A - the matrix 2462 2463 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 2464 MatXXXXSetPreallocation() paradigm instead of this routine directly. 2465 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 2466 2467 Notes: 2468 If nnz is given then nz is ignored 2469 2470 The AIJ format (also called the Yale sparse matrix format or 2471 compressed row storage) is fully compatible with standard Fortran 77 2472 storage. That is, the stored row and column indices can begin at 2473 either one (as in Fortran) or zero. See the users' manual for details. 2474 2475 Specify the preallocated storage with either nz or nnz (not both). 2476 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 2477 allocation. For large problems you MUST preallocate memory or you 2478 will get TERRIBLE performance, see the users' manual chapter on matrices. 2479 2480 By default, this format uses inodes (identical nodes) when possible, to 2481 improve numerical efficiency of matrix-vector products and solves. We 2482 search for consecutive rows with the same nonzero structure, thereby 2483 reusing matrix information to achieve increased efficiency.
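   Example usage (a minimal sketch; the per-row nonzero estimate of 5 and the index/value variables i, j, v are illustrative only):
.vb
   Mat A;
   ierr = MatCreateSeqAIJCUSPARSE(PETSC_COMM_SELF,m,n,5,NULL,&A);CHKERRQ(ierr);
   ierr = MatSetValues(A,1,&i,1,&j,&v,INSERT_VALUES);CHKERRQ(ierr);
   ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
   ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
.ve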
2484 2485 Level: intermediate 2486 2487 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 2488 @*/ 2489 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 2490 { 2491 PetscErrorCode ierr; 2492 2493 PetscFunctionBegin; 2494 ierr = MatCreate(comm,A);CHKERRQ(ierr); 2495 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 2496 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 2497 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 2498 PetscFunctionReturn(0); 2499 } 2500 2501 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 2502 { 2503 PetscErrorCode ierr; 2504 PetscSplitCSRDataStructure *d_mat = NULL; 2505 2506 PetscFunctionBegin; 2507 if (A->factortype == MAT_FACTOR_NONE) { 2508 d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat; 2509 ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL; 2510 ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr); 2511 } else { 2512 ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr); 2513 } 2514 if (d_mat) { 2515 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2516 cudaError_t err; 2517 PetscSplitCSRDataStructure h_mat; 2518 ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr); 2519 err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err); 2520 if (h_mat.seq) { 2521 if (a->compressedrow.use) { 2522 err = cudaFree(h_mat.diag.i);CHKERRCUDA(err); 2523 } 2524 err = cudaFree(d_mat);CHKERRCUDA(err); 2525 } 2526 } 2527 ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr); 2528 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 2529 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 2530 ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 2531 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 2532 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 2533 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 2534 PetscFunctionReturn(0); 2535 } 2536 2537 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*); 2538 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool); 2539 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B) 2540 { 2541 PetscErrorCode ierr; 2542 2543 PetscFunctionBegin; 2544 ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 2545 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr); 2546 PetscFunctionReturn(0); 2547 } 2548 2549 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg) 2550 { 2551 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2552 PetscErrorCode ierr; 2553 2554 PetscFunctionBegin; 2555 if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0); 2556 if (flg) { 2557 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 2558 2559 A->ops->mult = MatMult_SeqAIJ; 2560 A->ops->multadd = MatMultAdd_SeqAIJ; 2561 A->ops->multtranspose = MatMultTranspose_SeqAIJ; 2562 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 2563 A->ops->multhermitiantranspose = NULL; 2564 
    A->ops->multhermitiantransposeadd = NULL;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
  } else {
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  a->inode.use  = flg;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscSplitCSRDataStructure *d_mat = NULL;
  PetscErrorCode             ierr;
  PetscBool                  both = PETSC_FALSE;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (spptr->mat) {
      CsrMatrix *matrix = (CsrMatrix*)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE;
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
    if (spptr->matTranspose) {
      CsrMatrix *matrix = (CsrMatrix*)spptr->matTranspose->mat;
      if (matrix->values) {
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
    d_mat = spptr->deviceMat;
  }
  if (d_mat) {
    Mat_SeqAIJ  *a = (Mat_SeqAIJ*)A->data;
    PetscInt    n = A->rmap->n, nnz = a->i[n];
    cudaError_t err;
    PetscScalar *vals;
    ierr = PetscInfo(A,"Zero device matrix\n");CHKERRQ(ierr);
    err  = cudaMemcpy(&vals,&d_mat->diag.a,sizeof(PetscScalar*),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    err  = cudaMemset(vals,0,nnz*sizeof(PetscScalar));CHKERRCUDA(err);
  }
  ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;

  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat;

  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      spptr->format = MAT_CUSPARSE_CSR;
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
      spptr->deviceMat = NULL;
    } else {
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;
  B->ops->zeroentries    = MatZeroEntries_SeqAIJCUSPARSE;

  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr);
  ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be stored in
   CSR, ELL, or HYB (hybrid) format. The ELL and HYB formats require CUDA 4.2 or later,
   and are no longer supported as of CUDA 11.0 (see the format checks below).
   All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult only) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
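
   Example usage (an illustrative sketch, not an excerpt from a PETSc example; the
   format choice here is arbitrary):
.vb
   ierr = MatSetType(A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
   /* use ELL for the multiply kernel only; equivalent to -mat_cusparse_mult_storage_format ell */
   ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,MAT_CUSPARSE_ELL);CHKERRQ(ierr);
.ve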
   Level: beginner

.seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);

PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->cooPerm_v;
    delete (*cusparsestruct)->cooPerm_w;
    if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cudaError_t cerr = cudaFree((*cusparsestruct)->csr2cscBuffer);CHKERRCUDA(cerr);
#endif
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    delete (*mat)->values;
    delete (*mat)->column_indices;
    delete (*mat)->row_offsets;
    delete *mat;
    *mat = NULL;
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    if ((*trifactor)->solveBuffer) {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
#endif
    ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
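    /* free the storage-format-specific matrix (a cusparseHybMat_t for ELL/HYB, a CsrMatrix
       for CSR), then the matrix descriptor, the device-side scalar constants, and, for
       CUDA 11 and later, the SpMV descriptors and buffers created during MatMult setup */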
    if ((*matstruct)->mat) {
      if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
#endif
      } else {
        mat  = (CsrMatrix*)(*matstruct)->mat;
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha_one) { err = cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err = cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err = cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector   = NULL;
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    if ((handle = (*trifactors)->handle)) {
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* lexicographic comparison of (i,j) tuples; used to sort COO entries by row, then column */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    if (t1.get<0>() < t2.get<0>()) return true;
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
    return false;
  }
};

/* equality of (i,j) tuples; used to detect duplicate COO entries */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
    return true;
  }
};

/* maps equal adjacent values to 0 and differing ones to 1; used with adjacent_difference */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return t1 == t2 ? 0 : 1;
  }
};

/* logical OR of two 0/1 markers; used to combine row-change and column-change flags */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return t1||t2;
  }
};

#include <thrust/iterator/discard_iterator.h>
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *matrix;
  PetscErrorCode     ierr;
  cudaError_t        cerr;
  PetscInt           n;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  n      = cusp->cooPerm->size();
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!cusp->cooPerm_v) { cusp->cooPerm_v = new THRUSTARRAY(n); }
  ierr = PetscLogEventBegin(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  if (v) {
    cusp->cooPerm_v->assign(v,v+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
  } else {
    thrust::fill(thrust::device,cusp->cooPerm_v->begin(),cusp->cooPerm_v->end(),0.);
  }
  if (imode == ADD_VALUES) {
    if (cusp->cooPerm_a) {
      if (!cusp->cooPerm_w) cusp->cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cusp->cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cusp->cooPerm_w->begin(),cusp->cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
    } else {
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals());
    }
  } else {
    if (cusp->cooPerm_a) { /* non-unique insertion: if two different values target the same
                              (i,j) location the result is undefined, since we cannot
                              guarantee which of them is applied last */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()),
                                                                thrust::make_permutation_iterator(matrix->values->begin(),cusp->cooPerm_a->begin())));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()),
                                                                thrust::make_permutation_iterator(matrix->values->begin(),cusp->cooPerm_a->end())));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    } else {
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    }
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  /* we can remove this call when MatSeqAIJGetArray operations are used everywhere! */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#include <thrust/binary_search.h>
PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *matrix;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0;
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(MAT_CUSPARSEPreallCOO,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    delete cusp->cooPerm;
    delete cusp->cooPerm_v;
    delete cusp->cooPerm_w;
    delete cusp->cooPerm_a;
    cusp->cooPerm   = NULL;
    cusp->cooPerm_v = NULL;
    cusp->cooPerm_w = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());
    *cusp->cooPerm_a = d_i;
    THRUSTINTARRAY w = d_j;

    auto nekey = thrust::unique(fkey, ekey, IJEqual());
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* I couldn't come up with a more elegant algorithm */
      /* mark with 1 every position where the row index or the column index differs from the
         previous (sorted) entry, OR the two marks together, then prefix-sum them: the result
         maps each sorted COO entry to the offset of its unique (i,j) slot in the CSR values */
      thrust::adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
      thrust::adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());
      (*cusp->cooPerm_a)[0] = 0;
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(),
                        search_begin, search_begin + A->rmap->n,
                        ii.begin());

    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0;
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!nnzr;
      a->ilen[i] = a->imax[i] = nnzr;
    }
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEPreallCOO,A,0,0,0);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy garbage to the GPU */
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  {
    matrix = (CsrMatrix*)cusp->mat->mat;
    if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
    thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
  }

  A->offloadmask   = PETSC_OFFLOAD_CPU;
  A->assembled     = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
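
/* Example of the COO assembly path implemented above (a minimal sketch for illustration,
   not library code; A is assumed to already be a MATSEQAIJCUSPARSE matrix, and the two
   routines are reached through the public MatSetPreallocationCOO()/MatSetValuesCOO()
   interface composed in MatBindToCPU_SeqAIJCUSPARSE()):

     PetscInt    coo_i[] = {0,0,1};   // row indices; repeated (i,j) pairs are allowed
     PetscInt    coo_j[] = {0,1,1};   // column indices
     PetscScalar v[]     = {1.0,2.0,3.0};

     ierr = MatSetPreallocationCOO(A,3,coo_i,coo_j);CHKERRQ(ierr); // sorts/uniques the pattern, builds cooPerm
     ierr = MatSetValuesCOO(A,v,ADD_VALUES);CHKERRQ(ierr);         // permutes v and sums duplicates on the GPU
*/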