/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>

/* Option-name table in the layout PetscOptionsEnum() expects: value names in
   0-based order, then the enum type name, then the option prefix, then a 0 sentinel */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1,
      CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2,
      CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3,
      CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
  } cusparseCsr2CscAlg_t;
*/
/* NOTE: the value order of these tables must track cusparse.h (checked at runtime in MatSetFromOptions_SeqAIJCUSPARSE) */
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif

/* Factorization entry points implemented later in this file */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* Triangular solves and matrix-vector products provided by this implementation */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

/* Destructors for the GPU-side data structures (note MatSeqAIJCUSPARSEMultStruct_Destroy is C++-overloaded) */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

/* Host <-> device transfers of the AIJ data */
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);

PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

/* Attach a caller-owned CUDA stream to the matrix' cuSPARSE handle; all subsequent
   cuSPARSE work for this matrix is issued on that stream. Errors if A has no GPU data (spptr). */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

/* Adopt a caller-provided cuSPARSE handle; any handle the matrix previously owned is
   destroyed first. The pointer mode is (re)set to DEVICE on the active handle. */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      /* release the old handle before adopting the new one */
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

/* Forget (without destroying) the matrix' cuSPARSE handle; used when the handle is
   owned elsewhere. A no-op for non-seqaijcusparse matrices or when no GPU data exists. */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          flg;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg || !cusparsestruct) PetscFunctionReturn(0);
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

/* Query callback composed on factor matrices: reports MATSOLVERCUSPARSE as the solver package */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/* Factory routine registered with the MatSolverType system: creates an (empty) factor
   matrix B of type MATSEQAIJCUSPARSE and wires up the symbolic-factorization callbacks
   matching the requested factor type (LU/ILU/ILUDT or Cholesky/ICC). */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  (*B)->useordering = PETSC_TRUE;   /* factorization honors the user-supplied ordering */
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type-specific implementation behind MatCUSPARSESetFormat(); for SEQAIJCUSPARSE both
   MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL map onto the single stored format */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.
   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation (no-op for other types) */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
   MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose

   Collective on mat

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
-  transgen - the boolean flag

   Level: intermediate

.seealso: MATSEQAIJCUSPARSE, MatAIJCUSPARSESetGenerateTranspose()
@*/
PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
{
  PetscErrorCode ierr;
  PetscBool      flg;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  ierr =
PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (flg) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
    cusp->transgen = transgen;
    if (!transgen) { /* need to destroy the transpose matrix if present to prevent from logic errors if transgen is set to true later */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}

/* Process -mat_cusparse_* runtime options: explicit-transpose generation, storage formats,
   and (CUDA >= 11) the cuSPARSE SpMV/SpMM/csr2csc algorithm choices. Only unfactored
   matrices (MAT_FACTOR_NONE) consult these options. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscBool transgen = cusparsestruct->transgen;

    ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Symbolic ILU: reset any stale GPU factor data, run the host symbolic phase,
   then point the numeric phase at the CUSPARSE implementation */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic LU: same pattern as ILU above */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic ICC: host symbolic phase, then route numeric Cholesky to the CUSPARSE version */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic Cholesky: same pattern as ICC above */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr =
MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Assemble the unit-diagonal lower-triangular factor L from the host ILU data in A
   (which stores L's strictly-lower part and U packed together), upload it as a CSR
   matrix to the GPU, and run the cuSPARSE triangular-solve analysis. On repeat calls
   (loTriFactor already exists) only the numerical values are refreshed. Host staging
   buffers use pinned memory (cudaMallocHost) for fast transfers. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                          n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* only rebuild when the host copy is newer than (or the only copy of) the data */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;

        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix: row 0 holds only the unit diagonal; each
           subsequent row copies its strictly-lower entries then appends the unit diagonal */
        AiLo[0] = (PetscInt) 0;
        AiLo[n] = nzLower;
        AjLo[0] = (PetscInt) 0;
        AALo[0] = (MatScalar) 1.0;
        v       = aa;
        vi      = aj;
        offset  = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* newer API needs an explicit work buffer for the analysis/solve */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                                 );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        /* AALo is kept alive (as AA_h) for fast value-only updates; Ai/Aj are not needed again */
        loTriFactor->AA_h = AALo;
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v  = aa;
        vi = aj;
        offset = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v  += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Assemble the upper-triangular factor U (non-unit diagonal, stored with the
   reciprocal of the diagonal entry) from the host ILU data, upload to the GPU,
   and run the cuSPARSE triangular-solve analysis; values-only refresh on repeat calls. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                          n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next,
figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix, walking rows bottom-up because adiag[]
           indexes U's rows in reverse (adiag[i] > adiag[i+1]) */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;
          vi = aj + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements (host stores 1/diag; invert back here) */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1./v[nz];
          AiUp[i]      = AiUp[i+1] - (nz+1);

          ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = n;
        upTriFactor->csrMat->num_cols    = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* newer API needs an explicit work buffer for the analysis/solve */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                                 );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
        /* AAUp is retained (as AA_h) for later value-only updates; index arrays are released */
        upTriFactor->AA_h = AAUp;
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      } else {
        if (!upTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the upper triangular matrix (values only; sparsity pattern is unchanged) */
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v = aa + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1./v[nz];
          ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
        ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Build both ILU triangular factors on the GPU, allocate the intermediate work
   vector used by the two-stage solve, and upload the row/column permutations
   (only when the orderings are not the identity). */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  if
(!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cusparseTriFactors->nnz=a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices: upload the row permutation only if it is not the identity */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
    ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* upper triangular indices: likewise for the column permutation */
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
    ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Build both ICC triangular factors (U and its transpose-scaled companion) on the GPU.
   NOTE(review): A->data is cast both as Mat_SeqAIJ (for nz) and Mat_SeqSBAIJ (for i/j/a) —
   presumably the ICC factor stores an SBAIJ-style upper-triangle layout in this container;
   confirm against MatICCFactorSymbolic_SeqAIJ before changing. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n)
/* Body of MatSeqAIJCUSPARSEBuildICCTriMatrices (its declarations are in the preceding span):
   when the device copy is stale it builds, in pinned host memory, a unit-diagonal upper-triangular
   CSR factor (AAUp) and a scaled companion (AALo) from the SBAIJ storage, then uploads both.
   On first call (!upTriFactor && !loTriFactor) it also allocates the CSR structure, creates the
   cusparse descriptors and runs the triangular-solve analysis for each factor; on later calls only
   the numerical values are refilled and re-uploaded.  The lower solve reuses the UPPER CSR pattern
   with solveOp = CUSPARSE_OPERATION_TRANSPOSE, which is why its fill mode is also set to UPPER. */
PetscFunctionReturn(0); 699 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 700 try { 701 cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 702 cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 703 if (!upTriFactor && !loTriFactor) { 704 /* Allocate Space for the upper triangular matrix */ 705 cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 706 cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 707 708 /* Fill the upper triangular matrix */ 709 AiUp[0]=(PetscInt) 0; 710 AiUp[n]=nzUpper; 711 offset = 0; 712 for (i=0; i<n; i++) { 713 /* set the pointers */ 714 v = aa + ai[i]; 715 vj = aj + ai[i]; 716 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 717 718 /* first, set the diagonal elements */ 719 AjUp[offset] = (PetscInt) i; 720 AAUp[offset] = (MatScalar)1.0/v[nz]; 721 AiUp[i] = offset; 722 AALo[offset] = (MatScalar)1.0/v[nz]; 723 724 offset+=1; 725 if (nz>0) { 726 ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr); 727 ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 728 for (j=offset; j<offset+nz; j++) { 729 AAUp[j] = -AAUp[j]; 730 AALo[j] = AAUp[j]/v[nz]; 731 } 732 offset+=nz; 733 } 734 } 735 736 /* allocate space for the triangular factor information */ 737 ierr = PetscNew(&upTriFactor);CHKERRQ(ierr); 738 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 739 740 /* Create the matrix description */ 741 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 742 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 743 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 744 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 745 #else 746 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 747 #endif 748 stat =
/* upper factor: unit diagonal (inverse diagonal is stored explicitly in the values instead) */
cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 749 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 750 751 /* set the matrix */ 752 upTriFactor->csrMat = new CsrMatrix; 753 upTriFactor->csrMat->num_rows = A->rmap->n; 754 upTriFactor->csrMat->num_cols = A->cmap->n; 755 upTriFactor->csrMat->num_entries = a->nz; 756 757 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 758 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 759 760 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 761 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 762 763 upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 764 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 765 766 /* set the operation */ 767 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 768 769 /* Create the solve analysis information */ 770 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 771 stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 772 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 773 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 774 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 775 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 776 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 777 &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 778 cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 779 #endif 780 781 /* perform the solve analysis */ 782 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 783 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 784 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 785
upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo 786 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 787 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 788 #endif 789 );CHKERRCUSPARSE(stat); 790 cerr = WaitForCUDA();CHKERRCUDA(cerr); 791 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 792 793 /* assign the pointer */ 794 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 795 796 /* allocate space for the triangular factor information */ 797 ierr = PetscNew(&loTriFactor);CHKERRQ(ierr); 798 loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 799 800 /* Create the matrix description */ 801 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 802 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 803 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 804 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 805 #else 806 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 807 #endif 808 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 809 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 810 811 /* set the operation */ 812 loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 813 814 /* set the matrix */ 815 loTriFactor->csrMat = new CsrMatrix; 816 loTriFactor->csrMat->num_rows = A->rmap->n; 817 loTriFactor->csrMat->num_cols = A->cmap->n; 818 loTriFactor->csrMat->num_entries = a->nz; 819 820 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 821 loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 822 823 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 824 loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 825 826 loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 827
/* lower factor shares the upper CSR pattern (AiUp/AjUp) but carries the AALo values and is
   applied transposed (solveOp above), so only the values differ */
loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 828 829 /* Create the solve analysis information */ 830 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 831 stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 832 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 833 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 834 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 835 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 836 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 837 &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 838 cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 839 #endif 840 841 /* perform the solve analysis */ 842 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 843 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 844 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 845 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo 846 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 847 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 848 #endif 849 );CHKERRCUSPARSE(stat); 850 cerr = WaitForCUDA();CHKERRCUDA(cerr); 851 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 852 853 /* assign the pointer */ 854 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 855 856 ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr); 857 cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 858 cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 859 } else { 860 /* Fill the upper triangular matrix */ 861 offset = 0; 862 for (i=0; i<n; i++) { 863 /* set the pointers */ 864 v = aa + ai[i]; 865 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 866 867 /*
first, set the diagonal elements */ 868 AAUp[offset] = 1.0/v[nz]; 869 AALo[offset] = 1.0/v[nz]; 870 871 offset+=1; 872 if (nz>0) { 873 ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 874 for (j=offset; j<offset+nz; j++) { 875 AAUp[j] = -AAUp[j]; 876 AALo[j] = AAUp[j]/v[nz]; 877 } 878 offset+=nz; 879 } 880 } 881 if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 882 if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 883 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 884 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 885 ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 886 } 887 cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr); 888 cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr); 889 } catch(char *ex) { 890 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 891 } 892 } 893 PetscFunctionReturn(0); 894 } 895 896 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 897 { 898 PetscErrorCode ierr; 899 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 900 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 901 IS ip = a->row; 902 PetscBool perm_identity; 903 PetscInt n = A->rmap->n; 904 905 PetscFunctionBegin; 906 if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 907 ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr); 908 if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 909 cusparseTriFactors->nnz=(a->nz-n)*2 + n; 910 911 A->offloadmask = PETSC_OFFLOAD_BOTH; 912 913 /* lower triangular indices */ 914 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 915 if (!perm_identity) { 916 IS iip; 917 const PetscInt *irip,*rip; 918 919 ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr); 920 ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr); 921 ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); 922
/* Tail of MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU: upload the Cholesky permutation (rperm) and its
   inverse (cperm) to the GPU for the non-identity-ordering solve path.  Then the numeric-factor
   wrappers: both run the CPU factorization (MatLUFactorNumeric_SeqAIJ / head of the Cholesky
   variant), mark the factor host-resident, select the natural-ordering MatSolve kernels when the
   ordering IS objects are identities (avoids the permutation copies at solve time), and finally
   push the triangular factors to the GPU via the corresponding AnalysisAndCopyToGPU helper. */
cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 923 cusparseTriFactors->rpermIndices->assign(rip, rip+n); 924 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 925 cusparseTriFactors->cpermIndices->assign(irip, irip+n); 926 ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr); 927 ierr = ISDestroy(&iip);CHKERRQ(ierr); 928 ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 929 ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 930 } 931 PetscFunctionReturn(0); 932 } 933 934 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 935 { 936 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 937 IS isrow = b->row,iscol = b->col; 938 PetscBool row_identity,col_identity; 939 PetscErrorCode ierr; 940 941 PetscFunctionBegin; 942 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 943 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 944 B->offloadmask = PETSC_OFFLOAD_CPU; 945 /* determine which version of MatSolve needs to be used. */ 946 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 947 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 948 if (row_identity && col_identity) { 949 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 950 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 951 B->ops->matsolve = NULL; 952 B->ops->matsolvetranspose = NULL; 953 } else { 954 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 955 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 956 B->ops->matsolve = NULL; 957 B->ops->matsolvetranspose = NULL; 958 } 959 960 /* get the triangular factors */ 961 ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 962 PetscFunctionReturn(0); 963 } 964 965 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 966 { 967 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 968 IS ip = b->row; 969 PetscBool perm_identity; 970 PetscErrorCode ierr; 971 972 PetscFunctionBegin; 973 ierr =
/* Tail of MatCholeskyFactorNumeric_SeqAIJCUSPARSE (mirrors the LU wrapper above), then the start of
   MatSeqAIJCUSPARSEAnalyzeTransposeForSolve: builds explicit transposes of both triangular factors
   for MatSolveTranspose.  For the lower factor it clones the descriptor with the fill mode flipped
   (UPPER <-> LOWER, since the transpose swaps triangles), keeps index base/type/diag type, swaps
   num_rows/num_cols, and allocates device storage for the CSC form produced by csr2csc below. */
MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 974 ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 975 B->offloadmask = PETSC_OFFLOAD_CPU; 976 /* determine which version of MatSolve needs to be used. */ 977 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 978 if (perm_identity) { 979 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 980 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 981 B->ops->matsolve = NULL; 982 B->ops->matsolvetranspose = NULL; 983 } else { 984 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 985 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 986 B->ops->matsolve = NULL; 987 B->ops->matsolvetranspose = NULL; 988 } 989 990 /* get the triangular factors */ 991 ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 992 PetscFunctionReturn(0); 993 } 994 995 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 996 { 997 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 998 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 999 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1000 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; 1001 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; 1002 cusparseStatus_t stat; 1003 cusparseIndexBase_t indexBase; 1004 cusparseMatrixType_t matrixType; 1005 cusparseFillMode_t fillMode; 1006 cusparseDiagType_t diagType; 1007 cudaError_t cerr; 1008 PetscErrorCode ierr; 1009 1010 PetscFunctionBegin; 1011 /* allocate space for the transpose of the lower triangular factor */ 1012 ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr); 1013 loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1014 1015 /* set the matrix descriptors of the lower triangular factor */ 1016 matrixType = cusparseGetMatType(loTriFactor->descr); 1017 indexBase =
cusparseGetMatIndexBase(loTriFactor->descr); 1018 fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1019 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1020 diagType = cusparseGetMatDiagType(loTriFactor->descr); 1021 1022 /* Create the matrix description */ 1023 stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat); 1024 stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 1025 stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 1026 stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 1027 stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1028 1029 /* set the operation */ 1030 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1031 1032 /* allocate GPU space for the CSC of the lower triangular factor*/ 1033 loTriFactorT->csrMat = new CsrMatrix; 1034 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 1035 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 1036 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 1037 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1); 1038 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 1039 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 1040 1041 /* compute the transpose of the lower triangular factor, i.e.
the CSC */
/* csr2csc of the lower triangular factor into loTriFactorT (CUDA >= 11 needs an explicit
   user-provided buffer, sized by cusparseCsr2cscEx2_bufferSize).
   BUGFIX(review): the MAT_CUSPARSEGenerateTranspose event opened with PetscLogEventBegin before
   cusparse_csr2csc (original line 1055) was "closed" after WaitForCUDA with a second
   PetscLogEventBegin (original line 1072).  That leaves the event begun twice and never ended,
   which corrupts PETSc's -log_view event nesting; changed to PetscLogEventEnd. */
1042 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1043 stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1044 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1045 loTriFactor->csrMat->values->data().get(), 1046 loTriFactor->csrMat->row_offsets->data().get(), 1047 loTriFactor->csrMat->column_indices->data().get(), 1048 loTriFactorT->csrMat->values->data().get(), 1049 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1050 CUSPARSE_ACTION_NUMERIC,indexBase, 1051 CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1052 cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1053 #endif 1054 1055 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1056 stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1057 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1058 loTriFactor->csrMat->values->data().get(), 1059 loTriFactor->csrMat->row_offsets->data().get(), 1060 loTriFactor->csrMat->column_indices->data().get(), 1061 loTriFactorT->csrMat->values->data().get(), 1062 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1063 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1064 CUSPARSE_ACTION_NUMERIC, indexBase, 1065 CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer 1066 #else 1067 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1068 CUSPARSE_ACTION_NUMERIC, indexBase 1069 #endif 1070 );CHKERRCUSPARSE(stat); 1071 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1072 ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1073 1074 /* Create the solve analysis information */ 1075 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1076 stat =
/* Triangular-solve analysis for the transposed LOWER factor (buffer query + cudaMalloc are only
   needed on CUDA >= 9), then store it as loTriFactorPtrTranspose.  The second half repeats the
   descriptor cloning for the UPPER factor's transpose: fill mode flipped, index base/type/diag
   type copied, dimensions swapped, and device CSC storage allocated for csr2csc. */
cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1077 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1078 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, 1079 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1080 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1081 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, 1082 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1083 cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1084 #endif 1085 1086 /* perform the solve analysis */ 1087 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 1088 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1089 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1090 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo 1091 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1092 ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1093 #endif 1094 );CHKERRCUSPARSE(stat); 1095 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1096 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1097 1098 /* assign the pointer */ 1099 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 1100 1101 /*********************************************/ 1102 /* Now the Transpose of the Upper Tri Factor */ 1103 /*********************************************/ 1104 1105 /* allocate space for the transpose of the upper triangular factor */ 1106 ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr); 1107 upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1108 1109 /* set the matrix descriptors of the upper triangular factor */ 1110 matrixType = cusparseGetMatType(upTriFactor->descr); 1111 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 1112
fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1113 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1114 diagType = cusparseGetMatDiagType(upTriFactor->descr); 1115 1116 /* Create the matrix description */ 1117 stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat); 1118 stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 1119 stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 1120 stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 1121 stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1122 1123 /* set the operation */ 1124 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1125 1126 /* allocate GPU space for the CSC of the upper triangular factor*/ 1127 upTriFactorT->csrMat = new CsrMatrix; 1128 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1129 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1130 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1131 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1); 1132 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1133 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1134 1135 /* compute the transpose of the upper triangular factor, i.e.
the CSC */
/* csr2csc of the upper triangular factor into upTriFactorT (same pattern as the lower factor:
   explicit buffer on CUDA >= 11).
   BUGFIX(review): as in the lower-factor transpose above, the MAT_CUSPARSEGenerateTranspose event
   opened with PetscLogEventBegin (original line 1149) was followed after WaitForCUDA by a second
   PetscLogEventBegin (original line 1166) instead of PetscLogEventEnd, leaving the logging event
   unbalanced; changed to PetscLogEventEnd. */
1136 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1137 stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows, 1138 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1139 upTriFactor->csrMat->values->data().get(), 1140 upTriFactor->csrMat->row_offsets->data().get(), 1141 upTriFactor->csrMat->column_indices->data().get(), 1142 upTriFactorT->csrMat->values->data().get(), 1143 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1144 CUSPARSE_ACTION_NUMERIC,indexBase, 1145 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1146 cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1147 #endif 1148 1149 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1150 stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 1151 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1152 upTriFactor->csrMat->values->data().get(), 1153 upTriFactor->csrMat->row_offsets->data().get(), 1154 upTriFactor->csrMat->column_indices->data().get(), 1155 upTriFactorT->csrMat->values->data().get(), 1156 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1157 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1158 CUSPARSE_ACTION_NUMERIC, indexBase, 1159 CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer 1160 #else 1161 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1162 CUSPARSE_ACTION_NUMERIC, indexBase 1163 #endif 1164 );CHKERRCUSPARSE(stat); 1165 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1166 ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1167 1168 /* Create the solve analysis information */ 1169 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1170 stat =
/* Solve analysis for the transposed UPPER factor, stored as upTriFactorPtrTranspose, closing
   MatSeqAIJCUSPARSEAnalyzeTransposeForSolve.  Then the declarations of
   MatSeqAIJCUSPARSEGenerateTransposeForMult, which builds an explicit transpose of the
   multiplication structure (body continues in the next span). */
cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1171 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1172 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, 1173 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1174 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1175 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, 1176 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1177 cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1178 #endif 1179 1180 /* perform the solve analysis */ 1181 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 1182 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1183 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1184 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo 1185 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1186 ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1187 #endif 1188 );CHKERRCUSPARSE(stat); 1189 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1190 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1191 1192 /* assign the pointer */ 1193 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 1194 PetscFunctionReturn(0); 1195 } 1196 1197 static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A) 1198 { 1199 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1200 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1201 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1202 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1203 cusparseStatus_t stat; 1204 cusparseIndexBase_t indexBase; 1205 cudaError_t err; 1206 PetscErrorCode ierr; 1207
/* Body of MatSeqAIJCUSPARSEGenerateTransposeForMult: no-op unless transpose generation is enabled,
   not yet built, and the matrix is non-empty.  Allocates a new mult struct with device-resident
   alpha/beta constants (pointer mode DEVICE), then builds the transpose:
   - CSR format: csr2csc from the cached rowoffsets_gpu (CUDA >= 11 additionally needs a sized
     buffer and wraps the result in a cusparseCreateCsr generic descriptor with 32-bit indices);
   - ELL/HYB (pre-CUDA-11 only): hyb2csr -> csr2csc -> csr2hyb, freeing the CSR temporaries.
   Finishes by waiting for the device, closing the log event, and publishing matTranspose. */
1208 PetscFunctionBegin; 1209 if (!cusparsestruct->transgen || cusparsestruct->matTranspose || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0); 1210 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1211 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1212 /* create cusparse matrix */ 1213 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 1214 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat); 1215 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1216 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat); 1217 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1218 1219 /* set alpha and beta */ 1220 err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 1221 err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1222 err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1223 err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1224 err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1225 err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1226 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 1227 1228 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1229 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1230 CsrMatrix *matrixT= new CsrMatrix; 1231 matrixT->num_rows = A->cmap->n; 1232 matrixT->num_cols = A->rmap->n; 1233 matrixT->num_entries = a->nz; 1234 matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 1235 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1236 matrixT->values = new THRUSTARRAY(a->nz); 1237 1238 if (!cusparsestruct->rowoffsets_gpu) {
cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); } 1239 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 1240 1241 /* compute the transpose, i.e. the CSC */ 1242 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1243 stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, 1244 A->cmap->n, matrix->num_entries, 1245 matrix->values->data().get(), 1246 cusparsestruct->rowoffsets_gpu->data().get(), 1247 matrix->column_indices->data().get(), 1248 matrixT->values->data().get(), 1249 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1250 CUSPARSE_ACTION_NUMERIC,indexBase, 1251 cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1252 err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err); 1253 #endif 1254 1255 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1256 A->cmap->n, matrix->num_entries, 1257 matrix->values->data().get(), 1258 cusparsestruct->rowoffsets_gpu->data().get(), 1259 matrix->column_indices->data().get(), 1260 matrixT->values->data().get(), 1261 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1262 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1263 CUSPARSE_ACTION_NUMERIC,indexBase, 1264 cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer 1265 #else 1266 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1267 CUSPARSE_ACTION_NUMERIC, indexBase 1268 #endif 1269 );CHKERRCUSPARSE(stat); 1270 matstructT->mat = matrixT; 1271 1272 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1273 stat = cusparseCreateCsr(&matstructT->matDescr, 1274 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1275 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1276 matrixT->values->data().get(), 1277 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1278
indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 1279 #endif 1280 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1281 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1282 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1283 #else 1284 CsrMatrix *temp = new CsrMatrix; 1285 CsrMatrix *tempT = new CsrMatrix; 1286 /* First convert HYB to CSR */ 1287 temp->num_rows = A->rmap->n; 1288 temp->num_cols = A->cmap->n; 1289 temp->num_entries = a->nz; 1290 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1291 temp->column_indices = new THRUSTINTARRAY32(a->nz); 1292 temp->values = new THRUSTARRAY(a->nz); 1293 1294 stat = cusparse_hyb2csr(cusparsestruct->handle, 1295 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 1296 temp->values->data().get(), 1297 temp->row_offsets->data().get(), 1298 temp->column_indices->data().get());CHKERRCUSPARSE(stat); 1299 1300 /* Next, convert CSR to CSC (i.e. the matrix transpose) */ 1301 tempT->num_rows = A->rmap->n; 1302 tempT->num_cols = A->cmap->n; 1303 tempT->num_entries = a->nz; 1304 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1305 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1306 tempT->values = new THRUSTARRAY(a->nz); 1307 1308 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1309 temp->num_cols, temp->num_entries, 1310 temp->values->data().get(), 1311 temp->row_offsets->data().get(), 1312 temp->column_indices->data().get(), 1313 tempT->values->data().get(), 1314 tempT->column_indices->data().get(), 1315 tempT->row_offsets->data().get(), 1316 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1317 1318 /* Last, convert CSC to HYB */ 1319 cusparseHybMat_t hybMat; 1320 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1321 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1322 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1323 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1324 matstructT->descr, tempT->values->data().get(), 1325 tempT->row_offsets->data().get(), 1326 tempT->column_indices->data().get(), 1327 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1328 1329 /* assign the pointer */ 1330 matstructT->mat = hybMat; 1331 /* delete temporaries */ 1332 if (tempT) { 1333 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1334 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1335 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1336 delete (CsrMatrix*) tempT; 1337 } 1338 if (temp) { 1339 if (temp->values) delete (THRUSTARRAY*) temp->values; 1340 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1341 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1342 delete (CsrMatrix*) temp; 1343 } 1344 #endif 1345 } 1346 err = WaitForCUDA();CHKERRCUDA(err); 1347 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1348 ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1349 /* the compressed row indices is not used for matTranspose */ 1350 matstructT->cprowIndices = NULL; 1351 /* assign the pointer */ 1352 ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 1353 PetscFunctionReturn(0); 1354 } 1355 1356 /* Why do we need to analyze the tranposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE?
*/
/* Solve A^T x = b using the transposed triangular factors created by
   MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(). Since A = LU implies A^T = U^T L^T,
   the upper factor is applied first, then the lower one. The right-hand side is
   reordered with the row permutation on entry and the solution with the column
   permutation on exit; tempGPU is the work vector shared by the factor solves. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                               n = xx->map->n;
  const PetscScalar                      *barray;
  PetscScalar                            *xarray;
  thrust::device_ptr<const PetscScalar>  bGPU;
  thrust::device_ptr<PetscScalar>        xGPU;
  cusparseStatus_t                       stat;
  Mat_SeqAIJCUSPARSETriFactors           *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct      *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct      *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                            *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                         ierr;
  cudaError_t                            cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: x <- P_r(b) */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U: reads xarray, writes the intermediate into tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get()
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Then, solve L: reads tempGPU, writes the result back into xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Same as MatSolveTranspose_SeqAIJCUSPARSE but for a factorization done in the
   natural ordering: no row/column permutations are applied, so b feeds the U^T
   solve directly and the L^T solve writes straight into x. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U: reads barray, writes the intermediate into tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get()
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Then, solve L: reads tempGPU, writes the result into xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Solve A x = b with the (non-transposed) LU factors: reorder b with the row
   permutation, solve L then U, and reorder the result with the column permutation. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                      *barray;
  PetscScalar                            *xarray;
  thrust::device_ptr<const PetscScalar>  bGPU;
  thrust::device_ptr<PetscScalar>        xGPU;
  cusparseStatus_t                       stat;
  Mat_SeqAIJCUSPARSETriFactors           *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct      *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct      *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                            *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                         ierr;
  cudaError_t                            cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: temp <- P_r(b).
     Note: the length of a permutation_iterator range is fixed by the index
     iterator, so using bGPU as the element base of the end iterator is valid. */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L: reads tempGPU, writes into xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Then, solve U: reads xarray, writes into tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get()
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Last, reorder with the column permutation: x <- P_c(temp) */
  thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Solve A x = b for a factorization done in the natural ordering: no
   permutations are needed, so b feeds the L solve directly and the U solve
   writes straight into x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L: reads barray, writes the intermediate into tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get()
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Next, solve U: reads tempGPU, writes the result into xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz -
                               A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Copy the CSR values (not the pattern) from the GPU back to the host when the
   GPU copy is the freshest one; afterwards both copies are marked valid. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cudaError_t        cerr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat;

    ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

/* Hand callers the host values array, syncing it from the GPU first if needed.
   The host copy becomes authoritative (PETSC_OFFLOAD_CPU) because the caller
   receives a mutable pointer and may change the values. */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  *array = a->a;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}

/* Mirror the host AIJ matrix on the GPU. Two paths: when the nonzero pattern is
   unchanged (same nonzerostate) and the format is CSR, only the values are
   re-uploaded (and the cached transpose, if any, is refreshed via csr2csc);
   otherwise the GPU structures are rebuilt from scratch in the requested
   storage format (CSR, or ELL/HYB before CUDA 11). */
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* Copy values only */
      CsrMatrix *matrix,*matrixT;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR values");
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      matrix->values->assign(a->a, a->a+a->nz);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);

      /* Update matT when it was built before */
      if (cusparsestruct->matTranspose) {
        cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
        matrixT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
        ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                                A->cmap->n, matrix->num_entries,
                                matrix->values->data().get(),
                                cusparsestruct->rowoffsets_gpu->data().get(),
                                matrix->column_indices->data().get(),
                                matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                CUSPARSE_ACTION_NUMERIC,indexBase,
                                cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
#else
                                matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                                CUSPARSE_ACTION_NUMERIC, indexBase
#endif
                                );CHKERRCUSPARSE(stat);
        err  = WaitForCUDA();CHKERRCUDA(err);
        ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
      }
    } else {
      /* The nonzero pattern changed (or a non-CSR format is requested):
         destroy and rebuild the GPU structures */
      PetscInt nnz;
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      try {
        /* with compressed rows only the nonempty rows are uploaded; ridx maps
           them back to global row indices */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR row data");
        if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR column data");
        /* no host values yet: upload the pattern only and leave the host copy unauthoritative */
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
        else nnz = a->nz;

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident scalars for alpha/beta, matching CUSPARSE_POINTER_MODE_DEVICE below */
        err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                     mat->num_rows, mat->num_cols, mat->num_entries,
                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                     mat->values->data().get(),
                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* stage the data in CSR on the device, then convert to HYB/ELL below */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
                                  matstruct->descr, mat->values->data().get(),
                                  mat->row_offsets->data().get(),
                                  mat->column_indices->data().get(),
                                  hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* the CSR staging arrays are no longer needed once the HYB matrix is built */
          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    /* 'both' is false only when no host values existed (pattern-only upload) */
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

/* Functor for thrust: y += x, applied elementwise over a zipped (x,y) tuple. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/* Functor for thrust: y = x, applied elementwise over a zipped (x,y) tuple. */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/* Functor for thrust: x = y, i.e. assignment in the opposite direction of VecCUDAEquals. */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};

/* Per-product scratch data attached to Mat_Product for AIJCUSPARSE matrix-matrix
   products; freed by MatDestroy_MatMatCusparse(). */
struct MatMatCusparse {
  PetscBool      cisdense;   /* C was MATSEQDENSE on entry: convert back after computing on the GPU */
  PetscScalar    *Bt;        /* buffer for B^T (pre-CUDA-11 csrmm has no transpose-on-B) */
  Mat            X;          /* intermediate dense matrix for PtAP/RARt */
  PetscBool      reusesym;   /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;
  CsrMatrix      *Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t matSpBDescr;
  PetscBool            initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;
  cusparseDnMatDescr_t matCDescr;
  PetscInt             Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
  size_t               mmBufferSize;
  void                 *mmBuffer;
  void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};

/* Destructor for the MatMatCusparse product data: releases GPU buffers and
   cusparse descriptors (all guarded, since not every product type allocates
   every member) and the intermediate dense matrix X. cudaFree(NULL) is a no-op,
   so Bt may be freed unconditionally. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  PetscErrorCode ierr;
  MatMatCusparse *mmdata = (MatMatCusparse *)data;
  cudaError_t    cerr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
#endif

  PetscFunctionBegin;
  cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
  delete mmdata->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->mmBuffer)    { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); }
  if (mmdata->mmBuffer2)   { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); }
  if (mmdata->matBDescr)   { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->matCDescr)   { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->spgemmDesc)  { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); }
#endif
  ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);

/* Numeric phase of products with A sparse (SEQAIJCUSPARSE) and B dense:
   AB, AtB, ABt directly via SpMM/csrmm; PtAP and RARt compute X = A*op(B)
   first, then finish with a dense-dense multiply. On CUDA >= 11 the cusparse
   descriptors and the SpMM work buffer are cached in mmdata and reused across
   calls (rebuilt when the leading dimensions of B or C change). */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* pick op(A), the matrix to use (A or its cached transpose), and the result sizes */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    /* either let cusparse transpose on the fly, or use the explicitly generated transpose */
    if (!cusp->transgen) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* PtAP/RARt write the sparse-dense product into the intermediate X, not C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allcoate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
    /* grow the work buffer only when needed (cudaFree(NULL) is a no-op) */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
      cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
#else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* explicitly transpose B into the preallocated Bt buffer with cublas geam */
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ?
mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
#endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* ~2 flops per stored entry of A for each of the n columns of op(B) */
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  if (product->type == MATPRODUCT_RARt) {
    /* sparse phase above produced the dense intermediate X; finish with a dense-dense product
       (NOTE(review): the two PetscBool flags presumably select transposes on the first/second
       operand of the private helper -- confirm against its definition) */
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    /* same as RARt but the first operand (B, i.e. P) is used transposed */
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* restore the types the caller expects: if C (resp. B) was host-dense on entry,
     convert back from the CUDA dense type used for the GPU computation */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Symbolic phase for products of a SeqAIJCUSPARSE matrix A with a (CUDA) dense matrix B:
   validates types, sets the sizes and type of the result C, and allocates the
   per-product work data (MatMatCusparse) the numeric phase will reuse:
   - a device buffer for B^T when the pre-CUDA-11 csrmm path needs an explicit transpose;
   - a dense intermediate X for the two-step PtAP/RARt products. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  /* result dimensions depend on which product was requested */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense;
#if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}

/* Numeric phase of the sparse*sparse products AB, AtB, ABt with all operands
   SeqAIJCUSPARSE. The symbolic phase has already allocated C's CSR structure;
   here only the values are (re)computed on the GPU, then the short
   MatAssemblyEnd-style bookkeeping at 'finalize' marks C assembled. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    if (!Ccsr)
SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize; /* empty product: skip the GPU work but still run the assembly bookkeeping */
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);

  ptype = product->type;
  /* for symmetric operands the transposed product equals the plain one, so use the
     cheaper path that does not require the explicit transpose structure */
  if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
  /* pick the (possibly transposed) mult structs built by the symbolic phase */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
  if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  /* recompute values with the SpGEMM descriptor prepared by the symbolic phase,
     then copy the result into C's CSR arrays */
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
  ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}

/* Symbolic phase of the sparse*sparse products AB, AtB, ABt for SeqAIJCUSPARSE
   operands: computes C's nonzero pattern on the GPU (cusparseSpGEMM on CUDA >= 11,
   csrgemm otherwise), allocates C's device CSR arrays, and mirrors the pattern
   back to the host Mat_SeqAIJ so C behaves like a regular assembled AIJ matrix. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed,ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
  size_t                       bufSize2;
  cusparseSpMatDescr_t         BmatSpDescr;
#else
  int                          cnz;
#endif

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  C->product->data    = mmdata;
  C->product->destroy =
MatDestroy_MatMatCusparse;

  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
  ptype = product->type;
  /* for symmetric operands fall back to the cheaper non-transposed product */
  if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  /* select operand mult structs (building explicit transposes when needed) and
     record whether the B operand / the C result use compressed row storage */
  switch (ptype) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    k = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr);
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  ierr  = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  ierr  = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  c     = (Mat_SeqAIJ*)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr  = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
    c->compressedrow.nrows = a->compressedrow.nrows;
    ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
    /* C inherits A's set of nonzero rows */
    ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat        = Cmat;
  Ccusp->mat->mat   = Ccsr;
  Ccsr->num_rows    = Ccusp->nrows;
  Ccsr->num_cols    = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
  stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
  /* device-resident scalar constants used with CUSPARSE_POINTER_MODE_DEVICE */
  cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
    c->nz = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix*)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    /* alias B's values/columns but with uncompressed row offsets; owned by mmdata->Bcsr */
    CsrMatrix *cBcsr     = (CsrMatrix*)Bmat->mat;
    Bcsr                 = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
      ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr      = Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
                               Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                               Bcsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
  /* precompute flops count */
  if (ptype == MATPRODUCT_AB) {
    /* 2 flops for every (A row entry, matching B row entry) pair */
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i+1];
      for (j=st; j<en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2.*(b->i[brow+1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt anzi = a->i[i+1] - a->i[i];
      const PetscInt bnzi = b->i[i+1] - b->i[i];
      flops += (2.*anzi)*bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* create C's descriptor with nnz = 0; SpGEMM fills in the size, and the pointers
     are attached below once the pattern size is known */
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
                           NULL, NULL, NULL,
                           CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                           CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  stat  = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
  c->nz = (PetscInt) C_nnz1;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
  stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
  c->nz = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalizesym:
  /* mirror the device pattern into the host Mat_SeqAIJ so C is a valid AIJ matrix */
  c->singlemalloc = PETSC_FALSE;
  c->free_a       = PETSC_TRUE;
  c->free_ij      = PETSC_TRUE;
  ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
    PetscInt *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii = *Ccsr->row_offsets;
    jj = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  if (ciscompressed) { /*
PetscErrorCode ierr;
  PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr);
  /* only consider the CUSPARSE paths when no operand is bound to the CPU */
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
  }
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr);
    }
  }
  if (isdense) {
    /* sparse * dense: GPU kernel unless A lives on the CPU, in which case defer to the AIJ*dense path */
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) {
        ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    /* sparse * sparse on the GPU; three-operand products are composed from two-operand ones */
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* y = A x; thin wrapper over the shared mult kernel (no transpose, no Hermitian) */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* z = A x + y */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* y = A^H x (conjugate transpose) */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* z = A^H x + y */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* y = A^T x */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* z = op(A) x + y.
If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
/* Shared kernel behind MatMult/MatMultAdd and their (Hermitian-)transpose variants:
   computes zz = op(A)*xx + yy (or just op(A)*xx when yy is NULL) entirely on the GPU. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny;
#endif

  PetscFunctionBegin;
  /* herm without trans is not a valid op in the table above */
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) {
    /* Matrix has no nonzero rows: the product is zero, so zz is zeroed or copied from yy */
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !cusparsestruct->transgen) {
      /* Use the untransposed data with a (conjugate-)transpose cusparse operation */
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      /* Use an explicitly stored transpose (generated on demand) with a plain SpMV */
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
         allocated to accommodate different uses. So we get the length info directly from mat.
      */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
      */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA indexes the cached per-operation SpMV descriptors below, hence the range check */
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                       matstruct->matDescr,
                                       matstruct->cuSpMV[opA].vecXDescr, beta,
                                       matstruct->cuSpMV[opA].vecYDescr,
                                       cusparse_scalartype,
                                       cusparsestruct->spmvAlg,
                                       &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                          matstruct->alpha_one,
                          matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */
                          matstruct->cuSpMV[opA].vecXDescr,
                          beta,
                          matstruct->cuSpMV[opA].vecYDescr,
                          cusparse_scalartype,
                          cusparsestruct->spmvAlg,
                          matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
#else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
#endif
      }
    }
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      /* Transpose case: the SpMV already wrote the full-length result; only add yy if it was not folded in via beta */
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* zz = A^T xx + yy */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
PetscFunctionReturn(0); 2919 } 2920 2921 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 2922 { 2923 PetscErrorCode ierr; 2924 PetscSplitCSRDataStructure *d_mat = NULL; 2925 PetscFunctionBegin; 2926 if (A->factortype == MAT_FACTOR_NONE) { 2927 d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat; 2928 } 2929 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it? 2930 if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0); 2931 if (d_mat) { 2932 A->offloadmask = PETSC_OFFLOAD_GPU; 2933 } 2934 2935 PetscFunctionReturn(0); 2936 } 2937 2938 /* --------------------------------------------------------------------------------*/ 2939 /*@ 2940 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 2941 (the default parallel PETSc format). This matrix will ultimately pushed down 2942 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 2943 assembly performance the user should preallocate the matrix storage by setting 2944 the parameter nz (or the array nnz). By setting these parameters accurately, 2945 performance during matrix assembly can be increased by more than a factor of 50. 2946 2947 Collective 2948 2949 Input Parameters: 2950 + comm - MPI communicator, set to PETSC_COMM_SELF 2951 . m - number of rows 2952 . n - number of columns 2953 . nz - number of nonzeros per row (same for all rows) 2954 - nnz - array containing the number of nonzeros in the various rows 2955 (possibly different for each row) or NULL 2956 2957 Output Parameter: 2958 . A - the matrix 2959 2960 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 2961 MatXXXXSetPreallocation() paradgm instead of this routine directly. 
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage) is fully compatible with standard Fortran 77
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation. For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.
2981 2982 Level: intermediate 2983 2984 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 2985 @*/ 2986 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 2987 { 2988 PetscErrorCode ierr; 2989 2990 PetscFunctionBegin; 2991 ierr = MatCreate(comm,A);CHKERRQ(ierr); 2992 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 2993 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 2994 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 2995 PetscFunctionReturn(0); 2996 } 2997 2998 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 2999 { 3000 PetscErrorCode ierr; 3001 PetscSplitCSRDataStructure *d_mat = NULL; 3002 3003 PetscFunctionBegin; 3004 if (A->factortype == MAT_FACTOR_NONE) { 3005 d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat; 3006 ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL; 3007 ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr); 3008 } else { 3009 ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr); 3010 } 3011 if (d_mat) { 3012 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3013 cudaError_t err; 3014 PetscSplitCSRDataStructure h_mat; 3015 ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr); 3016 err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err); 3017 if (a->compressedrow.use) { 3018 err = cudaFree(h_mat.diag.i);CHKERRCUDA(err); 3019 } 3020 err = cudaFree(d_mat);CHKERRCUDA(err); 3021 } 3022 ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr); 3023 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 3024 ierr = 
PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);

/* Duplicates via the CPU SeqAIJ duplicate, then converts the copy in place to AIJCUSPARSE */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Y = Y + a*X on the GPU. Falls back to the CPU kernel if the two matrices do not use
   the same axpy implementation (i.e., one of them is bound to the CPU). Uses cublas
   axpy when the nonzero patterns match, cusparse geam for SUBSET_NONZERO_PATTERN. */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;
  cudaError_t        cerr;

  PetscFunctionBegin;
  if (X->ops->axpy != Y->ops->axpy) {
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
  cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
  if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    /* equal nnz: compare row offsets and column indices on the device to detect identical patterns */
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }

  if (str == SUBSET_NONZERO_PATTERN) {
    cusparseStatus_t stat;
    PetscScalar      b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t           bufferSize;
    void             *buffer;
#endif

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    /* alpha/beta (&a, &b) are host pointers here, so switch the pointer mode temporarily */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                          cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* Y = a*X + 1.0*Y, written in place into Y's CSR values */
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                               cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    cerr = cudaFree(buffer);CHKERRCUDA(cerr);
#else
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                               cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#endif
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical patterns: the nonzero arrays align elementwise, so a dense axpy suffices */
    cublasHandle_t cublasv2handle;
    cublasStatus_t berr;
    PetscBLASInt   one = 1, bnz = 1;

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else {
    /* general patterns: handled on the CPU */
    ierr = MatAXPY_SeqAIJ(Y,a,X,DIFFERENT_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Zeroes the matrix values on both the GPU (CSR and cached transpose, if present)
   and the CPU, updating the offload mask accordingly. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  PetscBool      both = PETSC_FALSE;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (spptr->mat) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE; /* GPU values were zeroed too, so both copies are valid */
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
    if (spptr->matTranspose) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
      if (matrix->values) {
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
  }
  //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
  ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;

  PetscFunctionReturn(0);
}

/* Switches the matrix operations between the CPU (SeqAIJ) and GPU (SeqAIJCUSPARSE)
   implementations, keeping the composed function table consistent with the choice. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  if (flg) {
    /* binding to CPU: make sure the host copy is current before switching */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    A->ops->axpy = MatAXPY_SeqAIJ;
    A->ops->zeroentries = MatZeroEntries_SeqAIJ;
    A->ops->mult = MatMult_SeqAIJ;
    A->ops->multadd = MatMultAdd_SeqAIJ;
    A->ops->multtranspose = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd =
MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  } else {
    /* binding to GPU: install the CUSPARSE implementations and re-compose GPU-specific functions */
    A->ops->axpy = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  /* inodes are a CPU-only optimization; only enable them when bound to the CPU */
  a->inode.use = flg;
  PetscFunctionReturn(0);
}

/* Converts a SeqAIJ matrix to SeqAIJCUSPARSE: allocates the GPU-side struct
   (mult struct or triangular factors depending on factor type), sets the default
   vector type to CUDA, and installs the GPU operation table. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat;

  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      spptr->format = MAT_CUSPARSE_CSR;
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
      spptr->deviceMat = NULL;
    } else {
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu =
MatBindToCPU_SeqAIJCUSPARSE; 3261 B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 3262 3263 ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr); 3264 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3265 ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 3266 PetscFunctionReturn(0); 3267 } 3268 3269 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 3270 { 3271 PetscErrorCode ierr; 3272 3273 PetscFunctionBegin; 3274 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 3275 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 3276 ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr); 3277 ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr); 3278 ierr = PetscOptionsEnd();CHKERRQ(ierr); 3279 PetscFunctionReturn(0); 3280 } 3281 3282 /*MC 3283 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 3284 3285 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 3286 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 3287 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 3288 3289 Options Database Keys: 3290 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 3291 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3292 - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 
3293 3294 Level: beginner 3295 3296 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 3297 M*/ 3298 3299 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*); 3300 3301 3302 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 3303 { 3304 PetscErrorCode ierr; 3305 3306 PetscFunctionBegin; 3307 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3308 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3309 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3310 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3311 PetscFunctionReturn(0); 3312 } 3313 3314 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 3315 { 3316 PetscErrorCode ierr; 3317 cusparseStatus_t stat; 3318 3319 PetscFunctionBegin; 3320 if (*cusparsestruct) { 3321 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr); 3322 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr); 3323 delete (*cusparsestruct)->workVector; 3324 delete (*cusparsestruct)->rowoffsets_gpu; 3325 delete (*cusparsestruct)->cooPerm; 3326 delete (*cusparsestruct)->cooPerm_a; 3327 if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);} 3328 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3329 cudaError_t cerr = cudaFree((*cusparsestruct)->csr2cscBuffer);CHKERRCUDA(cerr); 3330 #endif 3331 ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr); 3332 } 3333 PetscFunctionReturn(0); 
3334 } 3335 3336 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 3337 { 3338 PetscFunctionBegin; 3339 if (*mat) { 3340 delete (*mat)->values; 3341 delete (*mat)->column_indices; 3342 delete (*mat)->row_offsets; 3343 delete *mat; 3344 *mat = 0; 3345 } 3346 PetscFunctionReturn(0); 3347 } 3348 3349 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 3350 { 3351 cusparseStatus_t stat; 3352 PetscErrorCode ierr; 3353 3354 PetscFunctionBegin; 3355 if (*trifactor) { 3356 if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); } 3357 if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); } 3358 ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 3359 if ((*trifactor)->solveBuffer) {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);} 3360 if ((*trifactor)->AA_h) {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);} 3361 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3362 if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);} 3363 #endif 3364 ierr = PetscFree(*trifactor);CHKERRQ(ierr); 3365 } 3366 PetscFunctionReturn(0); 3367 } 3368 3369 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 3370 { 3371 CsrMatrix *mat; 3372 cusparseStatus_t stat; 3373 cudaError_t err; 3374 3375 PetscFunctionBegin; 3376 if (*matstruct) { 3377 if ((*matstruct)->mat) { 3378 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 3379 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3380 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3381 #else 3382 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 3383 stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat); 3384 #endif 3385 } else { 
3386 mat = (CsrMatrix*)(*matstruct)->mat; 3387 CsrMatrix_Destroy(&mat); 3388 } 3389 } 3390 if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); } 3391 delete (*matstruct)->cprowIndices; 3392 if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); } 3393 if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); } 3394 if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); } 3395 3396 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3397 Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 3398 if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);} 3399 for (int i=0; i<3; i++) { 3400 if (mdata->cuSpMV[i].initialized) { 3401 err = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err); 3402 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat); 3403 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat); 3404 } 3405 } 3406 #endif 3407 delete *matstruct; 3408 *matstruct = NULL; 3409 } 3410 PetscFunctionReturn(0); 3411 } 3412 3413 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors) 3414 { 3415 PetscErrorCode ierr; 3416 3417 PetscFunctionBegin; 3418 if (*trifactors) { 3419 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr); 3420 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr); 3421 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr); 3422 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr); 3423 delete (*trifactors)->rpermIndices; 3424 delete (*trifactors)->cpermIndices; 3425 delete (*trifactors)->workVector; 3426 (*trifactors)->rpermIndices = NULL; 3427 (*trifactors)->cpermIndices = NULL; 3428 (*trifactors)->workVector = NULL; 3429 } 3430 
PetscFunctionReturn(0); 3431 } 3432 3433 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 3434 { 3435 PetscErrorCode ierr; 3436 cusparseHandle_t handle; 3437 cusparseStatus_t stat; 3438 3439 PetscFunctionBegin; 3440 if (*trifactors) { 3441 ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr); 3442 if (handle = (*trifactors)->handle) { 3443 stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 3444 } 3445 ierr = PetscFree(*trifactors);CHKERRQ(ierr); 3446 } 3447 PetscFunctionReturn(0); 3448 } 3449 3450 struct IJCompare 3451 { 3452 __host__ __device__ 3453 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3454 { 3455 if (t1.get<0>() < t2.get<0>()) return true; 3456 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 3457 return false; 3458 } 3459 }; 3460 3461 struct IJEqual 3462 { 3463 __host__ __device__ 3464 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3465 { 3466 if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false; 3467 return true; 3468 } 3469 }; 3470 3471 struct IJDiff 3472 { 3473 __host__ __device__ 3474 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 3475 { 3476 return t1 == t2 ? 
0 : 1;
  }
};

/* Logical OR, used when combining duplicate-entry flags */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return t1||t2;
  }
};

#include <thrust/iterator/discard_iterator.h>
/* Inserts or adds the COO values v (host or device pointer) into the GPU CSR matrix,
   using the permutation (cooPerm) and duplicate map (cooPerm_a) built previously by
   MatSetPreallocationCOO_SeqAIJCUSPARSE(). A NULL v with INSERT_VALUES zeroes the
   matrix. Leaves the matrix assembled with the GPU copy marked up to date. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *cooPerm_v = NULL;
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *matrix;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;
  PetscInt                              n;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO preallocation info: fall back to a plain assembly */
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    /* host values: stage them through a temporary device array */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) {
      /* duplicates present: first reduce duplicates into a work array, then add to the matrix */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals());
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    }
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
  ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr);
  a->reallocs = 0;
  A->info.mallocs += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}

#include <thrust/binary_search.h>
/* Builds, on the GPU, the CSR structure and the insertion permutation for n COO
   entries (coo_i, coo_j), detecting and mapping duplicate entries. */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0;
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    /* size changed: discard any previously built permutation data */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);

    if (!cusp->cooPerm) { cusp->cooPerm = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* sort the (i,j) keys and record, in cooPerm, where each input entry went */
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());
    *cusp->cooPerm_a = d_i;
    THRUSTINTARRAY w = d_j;

    auto nekey = thrust::unique(fkey, ekey, IJEqual());
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
cusp->cooPerm_a = NULL; 3612 } else { /* I couldn't come up with a more elegant algorithm */ 3613 adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); 3614 adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff()); 3615 (*cusp->cooPerm_a)[0] = 0; 3616 w[0] = 0; 3617 thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); 3618 thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); 3619 } 3620 thrust::counting_iterator<PetscInt> search_begin(0); 3621 thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), 3622 search_begin, search_begin + A->rmap->n, 3623 ii.begin()); 3624 cerr = WaitForCUDA();CHKERRCUDA(cerr); 3625 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3626 3627 ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr); 3628 a->singlemalloc = PETSC_FALSE; 3629 a->free_a = PETSC_TRUE; 3630 a->free_ij = PETSC_TRUE; 3631 ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr); 3632 a->i[0] = 0; 3633 cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 3634 a->nz = a->maxnz = a->i[A->rmap->n]; 3635 a->rmax = 0; 3636 ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr); 3637 ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr); 3638 cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 3639 if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); } 3640 if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); } 3641 for (PetscInt i = 0; i < A->rmap->n; i++) { 3642 const PetscInt nnzr = a->i[i+1] - a->i[i]; 3643 nzr += (PetscInt)!!(nnzr); 3644 a->ilen[i] = a->imax[i] = nnzr; 3645 a->rmax = PetscMax(a->rmax,nnzr); 3646 } 3647 a->nonzerorowcnt = nzr; 3648 A->preallocated = PETSC_TRUE; 3649 ierr = 
PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr); 3650 ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr); 3651 } else { 3652 ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr); 3653 } 3654 ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr); 3655 3656 /* We want to allocate the CUSPARSE struct for matvec now. 3657 The code is so convoluted now that I prefer to copy zeros */ 3658 ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr); 3659 ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr); 3660 A->offloadmask = PETSC_OFFLOAD_CPU; 3661 A->nonzerostate++; 3662 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 3663 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr); 3664 3665 A->assembled = PETSC_FALSE; 3666 A->was_assembled = PETSC_FALSE; 3667 PetscFunctionReturn(0); 3668 } 3669 3670 PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a) 3671 { 3672 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3673 CsrMatrix *csr; 3674 PetscErrorCode ierr; 3675 3676 PetscFunctionBegin; 3677 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3678 PetscValidPointer(a,2); 3679 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3680 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3681 if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3682 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 3683 csr = (CsrMatrix*)cusp->mat->mat; 3684 if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3685 *a = csr->values->data().get(); 3686 PetscFunctionReturn(0); 3687 } 3688 3689 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a) 3690 { 3691 PetscFunctionBegin; 3692 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3693 PetscValidPointer(a,2); 3694 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3695 
*a = NULL; 3696 PetscFunctionReturn(0); 3697 } 3698 3699 PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a) 3700 { 3701 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3702 CsrMatrix *csr; 3703 PetscErrorCode ierr; 3704 3705 PetscFunctionBegin; 3706 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3707 PetscValidPointer(a,2); 3708 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3709 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3710 if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3711 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 3712 csr = (CsrMatrix*)cusp->mat->mat; 3713 if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3714 *a = csr->values->data().get(); 3715 A->offloadmask = PETSC_OFFLOAD_GPU; 3716 PetscFunctionReturn(0); 3717 } 3718 3719 PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a) 3720 { 3721 PetscErrorCode ierr; 3722 3723 PetscFunctionBegin; 3724 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3725 PetscValidPointer(a,2); 3726 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3727 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 3728 *a = NULL; 3729 PetscFunctionReturn(0); 3730 } 3731 3732 PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a) 3733 { 3734 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3735 CsrMatrix *csr; 3736 3737 PetscFunctionBegin; 3738 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3739 PetscValidPointer(a,2); 3740 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3741 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3742 if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3743 csr = (CsrMatrix*)cusp->mat->mat; 3744 if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3745 
*a = csr->values->data().get(); 3746 A->offloadmask = PETSC_OFFLOAD_GPU; 3747 PetscFunctionReturn(0); 3748 } 3749 3750 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a) 3751 { 3752 PetscErrorCode ierr; 3753 3754 PetscFunctionBegin; 3755 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3756 PetscValidPointer(a,2); 3757 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3758 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 3759 *a = NULL; 3760 PetscFunctionReturn(0); 3761 } 3762 3763 struct IJCompare4 3764 { 3765 __host__ __device__ 3766 inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) 3767 { 3768 if (t1.get<0>() < t2.get<0>()) return true; 3769 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 3770 return false; 3771 } 3772 }; 3773 3774 struct Shift 3775 { 3776 int _shift; 3777 3778 Shift(int shift) : _shift(shift) {} 3779 __host__ __device__ 3780 inline int operator() (const int &c) 3781 { 3782 return c + _shift; 3783 } 3784 }; 3785 3786 /* merges to SeqAIJCUSPARSE matrices, [A';B']' operation in matlab notation */ 3787 PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 3788 { 3789 PetscErrorCode ierr; 3790 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c; 3791 Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp; 3792 Mat_SeqAIJCUSPARSEMultStruct *Cmat; 3793 CsrMatrix *Acsr,*Bcsr,*Ccsr; 3794 PetscInt Annz,Bnnz; 3795 cusparseStatus_t stat; 3796 PetscInt i,m,n,zero = 0; 3797 cudaError_t cerr; 3798 3799 PetscFunctionBegin; 3800 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3801 PetscValidHeaderSpecific(B,MAT_CLASSID,2); 3802 PetscValidPointer(C,4); 3803 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3804 PetscCheckTypeName(B,MATSEQAIJCUSPARSE); 3805 if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != 
%D",A->rmap->n,B->rmap->n); 3806 if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 3807 if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3808 if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3809 if (reuse == MAT_INITIAL_MATRIX) { 3810 m = A->rmap->n; 3811 n = A->cmap->n + B->cmap->n; 3812 ierr = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr); 3813 ierr = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr); 3814 ierr = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3815 c = (Mat_SeqAIJ*)(*C)->data; 3816 Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 3817 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 3818 Ccsr = new CsrMatrix; 3819 Cmat->cprowIndices = NULL; 3820 c->compressedrow.use = PETSC_FALSE; 3821 c->compressedrow.nrows = 0; 3822 c->compressedrow.i = NULL; 3823 c->compressedrow.rindex = NULL; 3824 Ccusp->workVector = NULL; 3825 Ccusp->nrows = m; 3826 Ccusp->mat = Cmat; 3827 Ccusp->mat->mat = Ccsr; 3828 Ccsr->num_rows = m; 3829 Ccsr->num_cols = n; 3830 stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat); 3831 stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 3832 stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 3833 cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 3834 cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 3835 cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 3836 cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 3837 cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 3838 cerr = cudaMemcpy(Cmat->beta_one, 
&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 3839 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 3840 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 3841 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 3842 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr); 3843 if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3844 if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3845 3846 Acsr = (CsrMatrix*)Acusp->mat->mat; 3847 Bcsr = (CsrMatrix*)Bcusp->mat->mat; 3848 Annz = (PetscInt)Acsr->column_indices->size(); 3849 Bnnz = (PetscInt)Bcsr->column_indices->size(); 3850 c->nz = Annz + Bnnz; 3851 Ccsr->row_offsets = new THRUSTINTARRAY32(m+1); 3852 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3853 Ccsr->values = new THRUSTARRAY(c->nz); 3854 Ccsr->num_entries = c->nz; 3855 Ccusp->cooPerm = new THRUSTINTARRAY(c->nz); 3856 if (c->nz) { 3857 auto Acoo = new THRUSTINTARRAY32(Annz); 3858 auto Bcoo = new THRUSTINTARRAY32(Bnnz); 3859 auto Ccoo = new THRUSTINTARRAY32(c->nz); 3860 THRUSTINTARRAY32 *Aroff,*Broff; 3861 3862 if (a->compressedrow.use) { /* need full row offset */ 3863 if (!Acusp->rowoffsets_gpu) { 3864 Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 3865 Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 3866 ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 3867 } 3868 Aroff = Acusp->rowoffsets_gpu; 3869 } else Aroff = Acsr->row_offsets; 3870 if (b->compressedrow.use) { /* need full row offset */ 3871 if (!Bcusp->rowoffsets_gpu) { 3872 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 3873 Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 3874 ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 3875 } 3876 Broff = Bcusp->rowoffsets_gpu; 3877 } else Broff = Bcsr->row_offsets; 3878 ierr = 
PetscLogGpuTimeBegin();CHKERRQ(ierr); 3879 stat = cusparseXcsr2coo(Acusp->handle, 3880 Aroff->data().get(), 3881 Annz, 3882 m, 3883 Acoo->data().get(), 3884 CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 3885 stat = cusparseXcsr2coo(Bcusp->handle, 3886 Broff->data().get(), 3887 Bnnz, 3888 m, 3889 Bcoo->data().get(), 3890 CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 3891 /* Issues when using bool with large matrices on SUMMIT 10.2.89 */ 3892 auto Aperm = thrust::make_constant_iterator(1); 3893 auto Bperm = thrust::make_constant_iterator(0); 3894 #if PETSC_PKG_CUDA_VERSION_GE(10,0,0) 3895 auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n)); 3896 auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n)); 3897 #else 3898 /* there are issues instantiating the merge operation using a transform iterator for the columns of B */ 3899 auto Bcib = Bcsr->column_indices->begin(); 3900 auto Bcie = Bcsr->column_indices->end(); 3901 thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n)); 3902 #endif 3903 auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz); 3904 auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm)); 3905 auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm)); 3906 auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm)); 3907 auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm)); 3908 auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin())); 3909 auto p1 = Ccusp->cooPerm->begin(); 3910 auto p2 = Ccusp->cooPerm->begin(); 3911 thrust::advance(p2,Annz); 3912 PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4())); 3913 #if PETSC_PKG_CUDA_VERSION_LT(10,0,0) 
3914 thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n)); 3915 #endif 3916 auto cci = thrust::make_counting_iterator(zero); 3917 auto cce = thrust::make_counting_iterator(c->nz); 3918 #if 0 //Errors on SUMMIT cuda 11.1.0 3919 PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>())); 3920 #else 3921 auto pred = thrust::identity<int>(); 3922 PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred)); 3923 PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred)); 3924 #endif 3925 stat = cusparseXcoo2csr(Ccusp->handle, 3926 Ccoo->data().get(), 3927 c->nz, 3928 m, 3929 Ccsr->row_offsets->data().get(), 3930 CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 3931 cerr = WaitForCUDA();CHKERRCUDA(cerr); 3932 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3933 delete wPerm; 3934 delete Acoo; 3935 delete Bcoo; 3936 delete Ccoo; 3937 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3938 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, 3939 Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), 3940 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 3941 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 3942 #endif 3943 if (Acusp->transgen && Bcusp->transgen) { /* if A and B have the transpose, generate C transpose too */ 3944 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 3945 Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct; 3946 CsrMatrix *CcsrT = new CsrMatrix; 3947 CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 3948 CsrMatrix *BcsrT = BT ? 
(CsrMatrix*)Bcusp->matTranspose->mat : NULL; 3949 3950 Ccusp->transgen = PETSC_TRUE; 3951 CmatT->cprowIndices = NULL; 3952 CmatT->mat = CcsrT; 3953 CcsrT->num_rows = n; 3954 CcsrT->num_cols = m; 3955 CcsrT->num_entries = c->nz; 3956 3957 CcsrT->row_offsets = new THRUSTINTARRAY32(n+1); 3958 CcsrT->column_indices = new THRUSTINTARRAY32(c->nz); 3959 CcsrT->values = new THRUSTARRAY(c->nz); 3960 3961 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3962 auto rT = CcsrT->row_offsets->begin(); 3963 if (AT) { 3964 rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT); 3965 thrust::advance(rT,-1); 3966 } 3967 if (BT) { 3968 auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz)); 3969 auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz)); 3970 thrust::copy(titb,tite,rT); 3971 } 3972 auto cT = CcsrT->column_indices->begin(); 3973 if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT); 3974 if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT); 3975 auto vT = CcsrT->values->begin(); 3976 if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 3977 if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 3978 cerr = WaitForCUDA();CHKERRCUDA(cerr); 3979 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3980 3981 stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat); 3982 stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 3983 stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 3984 cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 3985 cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 3986 cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 3987 cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, 
sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 3988 cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 3989 cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 3990 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3991 stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, 3992 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), 3993 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 3994 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 3995 #endif 3996 Ccusp->matTranspose = CmatT; 3997 } 3998 } 3999 4000 c->singlemalloc = PETSC_FALSE; 4001 c->free_a = PETSC_TRUE; 4002 c->free_ij = PETSC_TRUE; 4003 ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 4004 ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 4005 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 4006 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 4007 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 4008 ii = *Ccsr->row_offsets; 4009 jj = *Ccsr->column_indices; 4010 cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4011 cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4012 } else { 4013 cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4014 cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4015 } 4016 ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 4017 ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 4018 ierr = 
PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 4019 c->maxnz = c->nz; 4020 c->nonzerorowcnt = 0; 4021 c->rmax = 0; 4022 for (i = 0; i < m; i++) { 4023 const PetscInt nn = c->i[i+1] - c->i[i]; 4024 c->ilen[i] = c->imax[i] = nn; 4025 c->nonzerorowcnt += (PetscInt)!!nn; 4026 c->rmax = PetscMax(c->rmax,nn); 4027 } 4028 ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr); 4029 ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 4030 (*C)->nonzerostate++; 4031 ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr); 4032 ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr); 4033 Ccusp->nonzerostate = (*C)->nonzerostate; 4034 (*C)->preallocated = PETSC_TRUE; 4035 } else { 4036 if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n); 4037 c = (Mat_SeqAIJ*)(*C)->data; 4038 if (c->nz) { 4039 Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 4040 if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm"); 4041 if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4042 if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate"); 4043 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4044 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 4045 if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4046 if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4047 Acsr = (CsrMatrix*)Acusp->mat->mat; 4048 Bcsr = (CsrMatrix*)Bcusp->mat->mat; 4049 Ccsr = (CsrMatrix*)Ccusp->mat->mat; 4050 if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size()); 4051 if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != 
%D",Bcsr->num_entries,(PetscInt)Bcsr->values->size()); 4052 if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size()); 4053 if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries); 4054 if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size()); 4055 auto pmid = Ccusp->cooPerm->begin(); 4056 thrust::advance(pmid,Acsr->num_entries); 4057 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4058 auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), 4059 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin()))); 4060 auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), 4061 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4062 thrust::for_each(zibait,zieait,VecCUDAEquals()); 4063 auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), 4064 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4065 auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), 4066 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end()))); 4067 thrust::for_each(zibbit,ziebit,VecCUDAEquals()); 4068 if (Acusp->transgen && Bcusp->transgen && Ccusp->transgen) { 4069 if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct"); 4070 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4071 CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4072 CsrMatrix *BcsrT = BT ? 
(CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4073 CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat; 4074 auto vT = CcsrT->values->begin(); 4075 if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4076 if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 4077 } 4078 cerr = WaitForCUDA();CHKERRCUDA(cerr); 4079 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4080 } 4081 } 4082 ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr); 4083 (*C)->assembled = PETSC_TRUE; 4084 (*C)->was_assembled = PETSC_FALSE; 4085 (*C)->offloadmask = PETSC_OFFLOAD_GPU; 4086 PetscFunctionReturn(0); 4087 } 4088