1 /* 2 Defines the basic matrix operations for the AIJ (compressed row) 3 matrix storage format using the CUSPARSE library, 4 */ 5 #define PETSC_SKIP_SPINLOCK 6 #define PETSC_SKIP_CXX_COMPLEX_FIX 7 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 8 9 #include <petscconf.h> 10 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 11 #include <../src/mat/impls/sbaij/seq/sbaij.h> 12 #include <../src/vec/vec/impls/dvecimpl.h> 13 #include <petsc/private/vecimpl.h> 14 #undef VecType 15 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 16 #include <thrust/async/for_each.h> 17 18 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 19 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 20 /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in 21 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them. 22 23 typedef enum { 24 CUSPARSE_MV_ALG_DEFAULT = 0, 25 CUSPARSE_COOMV_ALG = 1, 26 CUSPARSE_CSRMV_ALG1 = 2, 27 CUSPARSE_CSRMV_ALG2 = 3 28 } cusparseSpMVAlg_t; 29 30 typedef enum { 31 CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0, 32 CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1, 33 CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2, 34 CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3, 35 CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4, 36 CUSPARSE_SPMM_ALG_DEFAULT = 0, 37 CUSPARSE_SPMM_COO_ALG1 = 1, 38 CUSPARSE_SPMM_COO_ALG2 = 2, 39 CUSPARSE_SPMM_COO_ALG3 = 3, 40 CUSPARSE_SPMM_COO_ALG4 = 5, 41 CUSPARSE_SPMM_CSR_ALG1 = 4, 42 CUSPARSE_SPMM_CSR_ALG2 = 6, 43 } cusparseSpMMAlg_t; 44 45 typedef enum { 46 CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc 47 CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministc 48 } cusparseCsr2CscAlg_t; 49 */ 50 const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0}; 51 const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0}; 52 const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0}; 53 #endif 54 55 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 56 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 57 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 58 59 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 60 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 61 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 62 63 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 64 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 65 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 66 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 67 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat); 68 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure); 69 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar); 70 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 71 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 72 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 73 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 74 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 75 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 76 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool); 77 78 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**); 79 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**); 80 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat); 81 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**); 82 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**); 83 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**); 84 85 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat); 86 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat); 87 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool); 88 89 PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]); 90 PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode); 91 92 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]); 93 94 PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream) 95 { 96 cusparseStatus_t stat; 97 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 98 99 PetscFunctionBegin; 100 if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 101 cusparsestruct->stream = stream; 102 stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat); 103 PetscFunctionReturn(0); 104 } 105 106 PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle) 107 { 108 cusparseStatus_t stat; 109 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 110 111 PetscFunctionBegin; 112 if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 113 if (cusparsestruct->handle != handle) { 114 if (cusparsestruct->handle) { 115 stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat); 116 } 117 cusparsestruct->handle = handle; 118 } 119 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 120 PetscFunctionReturn(0); 121 } 122 123 PetscErrorCode MatCUSPARSEClearHandle(Mat A) 124 { 125 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 126 PetscBool flg; 127 PetscErrorCode ierr; 128 129 PetscFunctionBegin; 130 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 131 if (!flg || !cusparsestruct) PetscFunctionReturn(0); 132 if (cusparsestruct->handle) cusparsestruct->handle = 0; 133 PetscFunctionReturn(0); 134 } 135 136 PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type) 137 { 138 PetscFunctionBegin; 139 *type = MATSOLVERCUSPARSE; 140 PetscFunctionReturn(0); 141 } 142 143 /*MC 144 MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices 145 on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported 146 algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 147 performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 148 CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 149 algorithms are not recommended. This class does NOT support direct solver operations. 150 151 Level: beginner 152 153 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 154 M*/ 155 156 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B) 157 { 158 PetscErrorCode ierr; 159 PetscInt n = A->rmap->n; 160 161 PetscFunctionBegin; 162 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 163 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 164 (*B)->factortype = ftype; 165 (*B)->useordering = PETSC_TRUE; 166 ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 167 168 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 169 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 170 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 171 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 172 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 173 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 174 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 175 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 176 177 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 178 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr); 179 PetscFunctionReturn(0); 180 } 181 182 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 183 { 184 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 185 186 PetscFunctionBegin; 187 switch (op) { 188 case MAT_CUSPARSE_MULT: 189 cusparsestruct->format = format; 190 break; 191 case MAT_CUSPARSE_ALL: 192 cusparsestruct->format = format; 193 break; 194 default: 195 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op); 196 } 197 PetscFunctionReturn(0); 198 } 199 200 /*@ 201 MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 202 operation. Only the MatMult operation can use different GPU storage formats 203 for MPIAIJCUSPARSE matrices. 204 Not Collective 205 206 Input Parameters: 207 + A - Matrix of type SEQAIJCUSPARSE 208 . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 209 - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2) 210 211 Output Parameter: 212 213 Level: intermediate 214 215 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 216 @*/ 217 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 218 { 219 PetscErrorCode ierr; 220 221 PetscFunctionBegin; 222 PetscValidHeaderSpecific(A, MAT_CLASSID,1); 223 ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr); 224 PetscFunctionReturn(0); 225 } 226 227 /*@ 228 MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose 229 230 Collective on mat 231 232 Input Parameters: 233 + A - Matrix of type SEQAIJCUSPARSE 234 - transgen - the boolean flag 235 236 Level: intermediate 237 238 .seealso: MATSEQAIJCUSPARSE, MatAIJCUSPARSESetGenerateTranspose() 239 @*/ 240 PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen) 241 { 242 PetscErrorCode ierr; 243 PetscBool flg; 244 245 PetscFunctionBegin; 246 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 247 ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 248 if (flg) { 249 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 250 251 if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 252 cusp->transgen = transgen; 253 if (!transgen) { /* need to destroy the transpose matrix if present to prevent from logic errors if transgen is set to true later */ 254 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 255 } 256 } 257 PetscFunctionReturn(0); 258 } 259 260 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 261 { 262 PetscErrorCode ierr; 263 MatCUSPARSEStorageFormat format; 264 PetscBool flg; 265 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 266 267 PetscFunctionBegin; 268 ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr); 269 if (A->factortype == MAT_FACTOR_NONE) { 270 PetscBool transgen = cusparsestruct->transgen; 271 272 ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr); 273 if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);} 274 275 ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV", 276 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 277 if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);} 278 279 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 280 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 281 if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);} 282 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 283 ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", 284 "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr); 285 /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */ 286 if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 287 288 ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", 289 "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr); 290 if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly"); 291 292 ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", 293 "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr); 294 if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly"); 295 #endif 296 } 297 ierr = PetscOptionsTail();CHKERRQ(ierr); 298 PetscFunctionReturn(0); 299 } 300 301 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 302 { 303 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 304 PetscErrorCode ierr; 305 306 PetscFunctionBegin; 307 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 308 ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 309 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 310 PetscFunctionReturn(0); 311 } 312 313 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 314 { 315 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 316 PetscErrorCode ierr; 317 318 PetscFunctionBegin; 319 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 320 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 321 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 322 PetscFunctionReturn(0); 323 } 324 325 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 326 { 327 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 328 PetscErrorCode ierr; 329 330 PetscFunctionBegin; 331 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 332 ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 333 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 334 PetscFunctionReturn(0); 335 } 336 337 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 338 { 339 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 340 PetscErrorCode ierr; 341 342 PetscFunctionBegin; 343 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 344 ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 345 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 346 PetscFunctionReturn(0); 347 } 348 349 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 350 { 351 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 352 PetscInt n = A->rmap->n; 353 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 354 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 355 cusparseStatus_t stat; 356 const PetscInt *ai = a->i,*aj = a->j,*vi; 357 const MatScalar *aa = a->a,*v; 358 PetscInt *AiLo, *AjLo; 359 PetscInt i,nz, nzLower, offset, rowOffset; 360 PetscErrorCode ierr; 361 cudaError_t cerr; 362 363 PetscFunctionBegin; 364 if (!n) PetscFunctionReturn(0); 365 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 366 try { 367 /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */ 368 nzLower=n+ai[n]-ai[1]; 369 if (!loTriFactor) { 370 PetscScalar *AALo; 371 372 cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr); 373 374 /* Allocate Space for the lower triangular matrix */ 375 cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 376 cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr); 377 378 /* Fill the lower triangular matrix */ 379 AiLo[0] = (PetscInt) 0; 380 AiLo[n] = nzLower; 381 AjLo[0] = (PetscInt) 0; 382 AALo[0] = (MatScalar) 1.0; 383 v = aa; 384 vi = aj; 385 offset = 1; 386 rowOffset= 1; 387 for (i=1; i<n; i++) { 388 nz = ai[i+1] - ai[i]; 389 /* additional 1 for the term on the diagonal */ 390 AiLo[i] = rowOffset; 391 rowOffset += nz+1; 392 393 ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr); 394 ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr); 395 396 offset += nz; 397 AjLo[offset] = (PetscInt) i; 398 AALo[offset] = (MatScalar) 1.0; 399 offset += 1; 400 401 v += nz; 402 vi += nz; 403 } 404 405 /* allocate space for the triangular factor information */ 406 ierr = PetscNew(&loTriFactor);CHKERRQ(ierr); 407 loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 408 /* Create the matrix description */ 409 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 410 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 411 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 412 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 413 #else 414 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 415 #endif 416 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat); 417 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 418 419 /* set the operation */ 420 loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 421 422 /* set the matrix */ 423 loTriFactor->csrMat = new CsrMatrix; 424 loTriFactor->csrMat->num_rows = n; 425 loTriFactor->csrMat->num_cols = n; 426 loTriFactor->csrMat->num_entries = nzLower; 427 428 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 429 loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1); 430 431 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 432 loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower); 433 434 loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 435 loTriFactor->csrMat->values->assign(AALo, AALo+nzLower); 436 437 /* Create the solve analysis information */ 438 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 439 stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 440 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 441 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 442 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 443 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 444 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 445 &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 446 cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 447 #endif 448 449 /* perform the solve analysis */ 450 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 451 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 452 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 453 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo 454 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 455 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 456 #endif 457 );CHKERRCUSPARSE(stat); 458 cerr = WaitForCUDA();CHKERRCUDA(cerr); 459 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 460 461 /* assign the pointer */ 462 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 463 loTriFactor->AA_h = AALo; 464 cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr); 465 cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr); 466 ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 467 } else { /* update values only */ 468 if (!loTriFactor->AA_h) { 469 cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr); 470 } 471 /* Fill the lower triangular matrix */ 472 loTriFactor->AA_h[0] = 1.0; 473 v = aa; 474 vi = aj; 475 offset = 1; 476 for (i=1; i<n; i++) { 477 nz = ai[i+1] - ai[i]; 478 ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr); 479 offset += nz; 480 loTriFactor->AA_h[offset] = 1.0; 481 offset += 1; 482 v += nz; 483 } 484 loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower); 485 ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 486 } 487 } catch(char *ex) { 488 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 489 } 490 } 491 PetscFunctionReturn(0); 492 } 493 494 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 495 { 496 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 497 PetscInt n = A->rmap->n; 498 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 499 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 500 cusparseStatus_t stat; 501 const PetscInt *aj = a->j,*adiag = a->diag,*vi; 502 const MatScalar *aa = a->a,*v; 503 PetscInt *AiUp, *AjUp; 504 PetscInt i,nz, nzUpper, offset; 505 PetscErrorCode ierr; 506 cudaError_t cerr; 507 508 PetscFunctionBegin; 509 if (!n) PetscFunctionReturn(0); 510 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 511 try { 512 /* next, figure out the number of nonzeros in the upper triangular matrix. */ 513 nzUpper = adiag[0]-adiag[n]; 514 if (!upTriFactor) { 515 PetscScalar *AAUp; 516 517 cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 518 519 /* Allocate Space for the upper triangular matrix */ 520 cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 521 cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 522 523 /* Fill the upper triangular matrix */ 524 AiUp[0]=(PetscInt) 0; 525 AiUp[n]=nzUpper; 526 offset = nzUpper; 527 for (i=n-1; i>=0; i--) { 528 v = aa + adiag[i+1] + 1; 529 vi = aj + adiag[i+1] + 1; 530 531 /* number of elements NOT on the diagonal */ 532 nz = adiag[i] - adiag[i+1]-1; 533 534 /* decrement the offset */ 535 offset -= (nz+1); 536 537 /* first, set the diagonal elements */ 538 AjUp[offset] = (PetscInt) i; 539 AAUp[offset] = (MatScalar)1./v[nz]; 540 AiUp[i] = AiUp[i+1] - (nz+1); 541 542 ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr); 543 ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr); 544 } 545 546 /* allocate space for the triangular factor information */ 547 ierr = PetscNew(&upTriFactor);CHKERRQ(ierr); 548 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 549 550 /* Create the matrix description */ 551 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 552 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 553 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 554 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 555 #else 556 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 557 #endif 558 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 559 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 560 561 /* set the operation */ 562 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 563 564 /* set the matrix */ 565 upTriFactor->csrMat = new CsrMatrix; 566 upTriFactor->csrMat->num_rows = n; 567 upTriFactor->csrMat->num_cols = n; 568 upTriFactor->csrMat->num_entries = nzUpper; 569 570 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 571 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 572 573 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 574 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 575 576 upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 577 upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 578 579 /* Create the solve analysis information */ 580 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 581 stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 582 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 583 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 584 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 585 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 586 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 587 &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 588 cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 589 #endif 590 591 /* perform the solve analysis */ 592 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 593 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 594 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 595 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo 596 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 597 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 598 #endif 599 );CHKERRCUSPARSE(stat); 600 cerr = WaitForCUDA();CHKERRCUDA(cerr); 601 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 602 603 /* assign the pointer */ 604 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 605 upTriFactor->AA_h = AAUp; 606 cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 607 cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 608 ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 609 } else { 610 if (!upTriFactor->AA_h) { 611 cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 612 } 613 /* Fill the upper triangular matrix */ 614 offset = nzUpper; 615 for (i=n-1; i>=0; i--) { 616 v = aa + adiag[i+1] + 1; 617 618 /* number of elements NOT on the diagonal */ 619 nz = adiag[i] - adiag[i+1]-1; 620 621 /* decrement the offset */ 622 offset -= (nz+1); 623 624 /* first, set the diagonal elements */ 625 upTriFactor->AA_h[offset] = 1./v[nz]; 626 ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr); 627 } 628 upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper); 629 ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 630 } 631 } catch(char *ex) { 632 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 633 } 634 } 635 PetscFunctionReturn(0); 636 } 637 638 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 639 { 640 PetscErrorCode ierr; 641 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 642 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 643 IS isrow = a->row,iscol = a->icol; 644 PetscBool row_identity,col_identity; 645 PetscInt n = A->rmap->n; 646 647 PetscFunctionBegin; 648 if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 649 ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr); 650 ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr); 651 652 if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 653 cusparseTriFactors->nnz=a->nz; 654 655 A->offloadmask = PETSC_OFFLOAD_BOTH; 656 /* lower triangular indices */ 657 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 658 if (!row_identity && !cusparseTriFactors->rpermIndices) { 659 const PetscInt *r; 660 661 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 662 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 663 cusparseTriFactors->rpermIndices->assign(r, r+n); 664 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 665 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 666 } 667 668 /* upper triangular indices */ 669 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 670 if (!col_identity && !cusparseTriFactors->cpermIndices) { 671 const PetscInt *c; 672 673 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 674 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 675 cusparseTriFactors->cpermIndices->assign(c, c+n); 676 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 677 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 678 } 679 PetscFunctionReturn(0); 680 } 681 682 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 683 { 684 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 685 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 686 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 687 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 688 cusparseStatus_t stat; 689 PetscErrorCode ierr; 690 cudaError_t cerr; 691 PetscInt *AiUp, *AjUp; 692 PetscScalar *AAUp; 693 PetscScalar *AALo; 694 PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 695 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 696 const PetscInt *ai = b->i,*aj = b->j,*vj; 697 const MatScalar *aa = b->a,*v; 698 699 PetscFunctionBegin; 700 if (!n) PetscFunctionReturn(0); 701 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 702 try { 703 cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 704 cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 705 if (!upTriFactor && !loTriFactor) { 706 /* Allocate Space for the upper triangular matrix */ 707 cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 708 cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 709 710 /* Fill the upper triangular matrix */ 711 AiUp[0]=(PetscInt) 0; 712 AiUp[n]=nzUpper; 713 offset = 0; 714 for (i=0; i<n; i++) { 715 /* set the pointers */ 716 v = aa + ai[i]; 717 vj = aj + ai[i]; 718 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 719 720 /* first, set the diagonal elements */ 721 AjUp[offset] = (PetscInt) i; 722 AAUp[offset] = (MatScalar)1.0/v[nz]; 723 AiUp[i] = offset; 724 AALo[offset] = (MatScalar)1.0/v[nz]; 725 726 offset+=1; 727 if (nz>0) { 728 ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr); 729 ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 730 for (j=offset; j<offset+nz; j++) { 731 AAUp[j] = -AAUp[j]; 732 AALo[j] = AAUp[j]/v[nz]; 733 } 734 offset+=nz; 735 } 736 } 737 738 /* allocate space for the triangular factor information */ 739 ierr = PetscNew(&upTriFactor);CHKERRQ(ierr); 740 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 741 742 /* Create the matrix description */ 743 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 744 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 745 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 746 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 747 #else 748 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 749 #endif 750 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 751 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 752 753 /* set the matrix */ 754 upTriFactor->csrMat = new CsrMatrix; 755 upTriFactor->csrMat->num_rows = A->rmap->n; 756 upTriFactor->csrMat->num_cols = A->cmap->n; 757 upTriFactor->csrMat->num_entries = a->nz; 758 759 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 760 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 761 762 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 763 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 764 765 upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 766 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 767 768 /* set the operation */ 769 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 770 771 /* Create the solve analysis information */ 772 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 773 stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 774 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 775 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 776 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 777 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 778 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 779 &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 780 cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 781 #endif 782 783 /* perform the solve analysis */ 784 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 785 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 786 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 787 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo 788 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 789 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 790 #endif 791 );CHKERRCUSPARSE(stat); 792 cerr = WaitForCUDA();CHKERRCUDA(cerr); 793 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 794 795 /* assign the pointer */ 796 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 797 798 /* allocate space for the triangular factor information */ 799 ierr = PetscNew(&loTriFactor);CHKERRQ(ierr); 800 loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 801 802 /* Create the matrix description */ 803 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 804 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 805 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 806 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 807 #else 808 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 809 #endif 810 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 811 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 812 813 /* set the operation */ 814 loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 815 816 /* set the matrix */ 817 loTriFactor->csrMat = new CsrMatrix; 818 loTriFactor->csrMat->num_rows = A->rmap->n; 819 loTriFactor->csrMat->num_cols = A->cmap->n; 820 loTriFactor->csrMat->num_entries = a->nz; 821 822 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 823 loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 824 825 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 826 loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 827 828 loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 829 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 830 831 /* Create the solve analysis information */ 832 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 833 stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 834 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 835 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 836 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 837 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 838 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 839 &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 840 cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 841 #endif 842 843 /* perform the solve analysis */ 844 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 845 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 846 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 847 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo 848 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 849 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 850 #endif 851 );CHKERRCUSPARSE(stat); 852 cerr = WaitForCUDA();CHKERRCUDA(cerr); 853 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 854 855 /* assign the pointer */ 856 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 857 858 ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr); 859 cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 860 cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 861 } else { 862 /* Fill the upper triangular matrix */ 863 offset = 0; 864 for (i=0; i<n; i++) { 865 /* set the pointers */ 866 v = aa + ai[i]; 867 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 868 869 /* first, set the diagonal elements */ 870 AAUp[offset] = 1.0/v[nz]; 871 AALo[offset] = 1.0/v[nz]; 872 873 offset+=1; 874 if (nz>0) { 875 ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 876 for (j=offset; j<offset+nz; j++) { 877 AAUp[j] = -AAUp[j]; 878 AALo[j] = AAUp[j]/v[nz]; 879 } 880 offset+=nz; 881 } 882 } 883 if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 884 if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 885 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 886 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 887 ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 888 } 889 cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr); 890 cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr); 891 } catch(char *ex) { 892 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 893 } 894 } 895 PetscFunctionReturn(0); 896 } 897 898 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 899 { 900 PetscErrorCode ierr; 901 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 902 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 903 IS ip = a->row; 904 PetscBool perm_identity; 905 PetscInt n = A->rmap->n; 906 907 PetscFunctionBegin; 908 if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 909 ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr); 910 if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 911 cusparseTriFactors->nnz=(a->nz-n)*2 + n; 912 913 A->offloadmask = PETSC_OFFLOAD_BOTH; 914 915 /* lower triangular indices */ 916 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 917 if (!perm_identity) { 918 IS iip; 919 const PetscInt *irip,*rip; 920 921 ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr); 922 ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr); 923 ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); 924 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 925 cusparseTriFactors->rpermIndices->assign(rip, rip+n); 926 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 927 cusparseTriFactors->cpermIndices->assign(irip, irip+n); 928 ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr); 929 ierr = ISDestroy(&iip);CHKERRQ(ierr); 930 ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 931 ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 932 } 933 PetscFunctionReturn(0); 934 } 935 936 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 937 { 938 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 939 IS isrow = b->row,iscol = b->col; 940 PetscBool row_identity,col_identity; 941 PetscErrorCode ierr; 942 943 PetscFunctionBegin; 944 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 945 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 946 B->offloadmask = PETSC_OFFLOAD_CPU; 947 /* determine which version of MatSolve needs to be used. */ 948 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 949 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 950 if (row_identity && col_identity) { 951 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 952 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 953 B->ops->matsolve = NULL; 954 B->ops->matsolvetranspose = NULL; 955 } else { 956 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 957 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 958 B->ops->matsolve = NULL; 959 B->ops->matsolvetranspose = NULL; 960 } 961 962 /* get the triangular factors */ 963 ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 964 PetscFunctionReturn(0); 965 } 966 967 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 968 { 969 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 970 IS ip = b->row; 971 PetscBool perm_identity; 972 PetscErrorCode ierr; 973 974 PetscFunctionBegin; 975 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 976 ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 977 B->offloadmask = PETSC_OFFLOAD_CPU; 978 /* determine which version of MatSolve needs to be used. */ 979 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 980 if (perm_identity) { 981 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 982 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 983 B->ops->matsolve = NULL; 984 B->ops->matsolvetranspose = NULL; 985 } else { 986 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 987 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 988 B->ops->matsolve = NULL; 989 B->ops->matsolvetranspose = NULL; 990 } 991 992 /* get the triangular factors */ 993 ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 994 PetscFunctionReturn(0); 995 } 996 997 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 998 { 999 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1000 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1001 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1002 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; 1003 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; 1004 cusparseStatus_t stat; 1005 cusparseIndexBase_t indexBase; 1006 cusparseMatrixType_t matrixType; 1007 cusparseFillMode_t fillMode; 1008 cusparseDiagType_t diagType; 1009 cudaError_t cerr; 1010 PetscErrorCode ierr; 1011 1012 PetscFunctionBegin; 1013 /* allocate space for the transpose of the lower triangular factor */ 1014 ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr); 1015 loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1016 1017 /* set the matrix descriptors of the lower triangular factor */ 1018 matrixType = cusparseGetMatType(loTriFactor->descr); 1019 indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 1020 fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1021 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1022 diagType = cusparseGetMatDiagType(loTriFactor->descr); 1023 1024 /* Create the matrix description */ 1025 stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat); 1026 stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 1027 stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 1028 stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 1029 stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1030 1031 /* set the operation */ 1032 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1033 1034 /* allocate GPU space for the CSC of the lower triangular factor*/ 1035 loTriFactorT->csrMat = new CsrMatrix; 1036 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 1037 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 1038 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 1039 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1); 1040 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 1041 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 1042 1043 /* compute the transpose of the lower triangular factor, i.e. the CSC */ 1044 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1045 stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1046 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1047 loTriFactor->csrMat->values->data().get(), 1048 loTriFactor->csrMat->row_offsets->data().get(), 1049 loTriFactor->csrMat->column_indices->data().get(), 1050 loTriFactorT->csrMat->values->data().get(), 1051 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1052 CUSPARSE_ACTION_NUMERIC,indexBase, 1053 CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1054 cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1055 #endif 1056 1057 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1058 stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1059 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1060 loTriFactor->csrMat->values->data().get(), 1061 loTriFactor->csrMat->row_offsets->data().get(), 1062 loTriFactor->csrMat->column_indices->data().get(), 1063 loTriFactorT->csrMat->values->data().get(), 1064 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1065 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1066 CUSPARSE_ACTION_NUMERIC, indexBase, 1067 CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer 1068 #else 1069 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1070 CUSPARSE_ACTION_NUMERIC, indexBase 1071 #endif 1072 );CHKERRCUSPARSE(stat); 1073 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1074 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1075 1076 /* Create the solve analysis information */ 1077 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1078 stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1079 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1080 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, 1081 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1082 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1083 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, 1084 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1085 cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1086 #endif 1087 1088 /* perform the solve analysis */ 1089 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 1090 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1091 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1092 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo 1093 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1094 ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1095 #endif 1096 );CHKERRCUSPARSE(stat); 1097 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1098 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1099 1100 /* assign the pointer */ 1101 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 1102 1103 /*********************************************/ 1104 /* Now the Transpose of the Upper Tri Factor */ 1105 /*********************************************/ 1106 1107 /* allocate space for the transpose of the upper triangular factor */ 1108 ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr); 1109 upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1110 1111 /* set the matrix descriptors of the upper triangular factor */ 1112 matrixType = cusparseGetMatType(upTriFactor->descr); 1113 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 1114 fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1115 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1116 diagType = cusparseGetMatDiagType(upTriFactor->descr); 1117 1118 /* Create the matrix description */ 1119 stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat); 1120 stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 1121 stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 1122 stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 1123 stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1124 1125 /* set the operation */ 1126 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1127 1128 /* allocate GPU space for the CSC of the upper triangular factor*/ 1129 upTriFactorT->csrMat = new CsrMatrix; 1130 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1131 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1132 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1133 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1); 1134 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1135 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1136 1137 /* compute the transpose of the upper triangular factor, i.e. the CSC */ 1138 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1139 stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows, 1140 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1141 upTriFactor->csrMat->values->data().get(), 1142 upTriFactor->csrMat->row_offsets->data().get(), 1143 upTriFactor->csrMat->column_indices->data().get(), 1144 upTriFactorT->csrMat->values->data().get(), 1145 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1146 CUSPARSE_ACTION_NUMERIC,indexBase, 1147 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1148 cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1149 #endif 1150 1151 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1152 stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 1153 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1154 upTriFactor->csrMat->values->data().get(), 1155 upTriFactor->csrMat->row_offsets->data().get(), 1156 upTriFactor->csrMat->column_indices->data().get(), 1157 upTriFactorT->csrMat->values->data().get(), 1158 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1159 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1160 CUSPARSE_ACTION_NUMERIC, indexBase, 1161 CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer 1162 #else 1163 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1164 CUSPARSE_ACTION_NUMERIC, indexBase 1165 #endif 1166 );CHKERRCUSPARSE(stat); 1167 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1168 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1169 1170 /* Create the solve analysis information */ 1171 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1172 stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1173 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1174 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, 1175 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1176 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1177 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, 1178 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1179 cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1180 #endif 1181 1182 /* perform the solve analysis */ 1183 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 1184 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1185 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1186 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo 1187 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1188 ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1189 #endif 1190 );CHKERRCUSPARSE(stat); 1191 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1192 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1193 1194 /* assign the pointer */ 1195 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 1196 PetscFunctionReturn(0); 1197 } 1198 1199 struct PetscScalarToPetscInt 1200 { 1201 __host__ __device__ 1202 PetscInt operator()(PetscScalar s) 1203 { 1204 return (PetscInt)PetscRealPart(s); 1205 } 1206 }; 1207 1208 static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A) 1209 { 1210 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1211 Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT; 1212 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1213 cusparseStatus_t stat; 1214 cusparseIndexBase_t indexBase; 1215 cudaError_t err; 1216 PetscErrorCode ierr; 1217 1218 PetscFunctionBegin; 1219 if (!cusparsestruct->transgen || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0); 1220 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1221 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1222 if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing mat struct"); 1223 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1224 if (cusparsestruct->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing matTranspose struct"); 1225 if (cusparsestruct->transupdated) PetscFunctionReturn(0); 1226 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1227 if (cusparsestruct->format != MAT_CUSPARSE_CSR) { 1228 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 1229 } 1230 if (!cusparsestruct->matTranspose) { /* create cusparse matrix */ 1231 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 1232 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat); 1233 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1234 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat); 1235 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1236 1237 /* set alpha and beta */ 1238 err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 1239 err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1240 err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1241 err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1242 err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1243 err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1244 1245 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1246 CsrMatrix *matrixT = new CsrMatrix; 1247 matstructT->mat = matrixT; 1248 matrixT->num_rows = A->cmap->n; 1249 matrixT->num_cols = A->rmap->n; 1250 matrixT->num_entries = a->nz; 1251 matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 1252 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1253 matrixT->values = new THRUSTARRAY(a->nz); 1254 1255 if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); } 1256 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 1257 1258 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1259 stat = cusparseCreateCsr(&matstructT->matDescr, 1260 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1261 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1262 matrixT->values->data().get(), 1263 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1264 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 1265 #endif 1266 } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 1267 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1268 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1269 #else 1270 CsrMatrix *temp = new CsrMatrix; 1271 CsrMatrix *tempT = new CsrMatrix; 1272 /* First convert HYB to CSR */ 1273 temp->num_rows = A->rmap->n; 1274 temp->num_cols = A->cmap->n; 1275 temp->num_entries = a->nz; 1276 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1277 temp->column_indices = new THRUSTINTARRAY32(a->nz); 1278 temp->values = new THRUSTARRAY(a->nz); 1279 1280 stat = cusparse_hyb2csr(cusparsestruct->handle, 1281 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 1282 temp->values->data().get(), 1283 temp->row_offsets->data().get(), 1284 temp->column_indices->data().get());CHKERRCUSPARSE(stat); 1285 1286 /* Next, convert CSR to CSC (i.e. the matrix transpose) */ 1287 tempT->num_rows = A->rmap->n; 1288 tempT->num_cols = A->cmap->n; 1289 tempT->num_entries = a->nz; 1290 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1291 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1292 tempT->values = new THRUSTARRAY(a->nz); 1293 1294 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1295 temp->num_cols, temp->num_entries, 1296 temp->values->data().get(), 1297 temp->row_offsets->data().get(), 1298 temp->column_indices->data().get(), 1299 tempT->values->data().get(), 1300 tempT->column_indices->data().get(), 1301 tempT->row_offsets->data().get(), 1302 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1303 1304 /* Last, convert CSC to HYB */ 1305 cusparseHybMat_t hybMat; 1306 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1307 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1308 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1309 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1310 matstructT->descr, tempT->values->data().get(), 1311 tempT->row_offsets->data().get(), 1312 tempT->column_indices->data().get(), 1313 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1314 1315 /* assign the pointer */ 1316 matstructT->mat = hybMat; 1317 cusparsestruct->transupdated = PETSC_TRUE; 1318 /* delete temporaries */ 1319 if (tempT) { 1320 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1321 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1322 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1323 delete (CsrMatrix*) tempT; 1324 } 1325 if (temp) { 1326 if (temp->values) delete (THRUSTARRAY*) temp->values; 1327 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1328 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1329 delete (CsrMatrix*) temp; 1330 } 1331 #endif 1332 } 1333 } 1334 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */ 1335 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1336 CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat; 1337 if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix"); 1338 if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix rows"); 1339 if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix cols"); 1340 if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix values"); 1341 if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT"); 1342 if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT rows"); 1343 if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT cols"); 1344 if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT values"); 1345 if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */ 1346 cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1347 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 1348 ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 1349 } 1350 if (!cusparsestruct->csr2csc_i) { 1351 THRUSTARRAY csr2csc_a(matrix->num_entries); 1352 PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0)); 1353 1354 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1355 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1356 void *csr2cscBuffer; 1357 size_t csr2cscBufferSize; 1358 stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, 1359 A->cmap->n, matrix->num_entries, 1360 matrix->values->data().get(), 1361 cusparsestruct->rowoffsets_gpu->data().get(), 1362 matrix->column_indices->data().get(), 1363 matrixT->values->data().get(), 1364 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1365 CUSPARSE_ACTION_NUMERIC,indexBase, 1366 cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat); 1367 err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err); 1368 #endif 1369 1370 stat = cusparse_csr2csc(cusparsestruct->handle, 1371 A->rmap->n,A->cmap->n,matrix->num_entries, 1372 csr2csc_a.data().get(),cusparsestruct->rowoffsets_gpu->data().get(),matrix->column_indices->data().get(), 1373 matrixT->values->data().get(), 1374 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1375 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1376 CUSPARSE_ACTION_NUMERIC,indexBase, 1377 cusparsestruct->csr2cscAlg, csr2cscBuffer 1378 #else 1379 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1380 CUSPARSE_ACTION_NUMERIC, indexBase 1381 #endif 1382 );CHKERRCUSPARSE(stat); 1383 cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); 1384 PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt())); 1385 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1386 err = cudaFree(csr2cscBuffer);CHKERRCUDA(err); 1387 #endif 1388 } 1389 PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), 1390 thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), 1391 matrixT->values->begin())); 1392 } 1393 ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1394 /* the compressed row indices is not used for matTranspose */ 1395 matstructT->cprowIndices = NULL; 1396 /* assign the pointer */ 1397 ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 1398 cusparsestruct->transupdated = PETSC_TRUE; 1399 PetscFunctionReturn(0); 1400 } 1401 1402 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 1403 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1404 { 1405 PetscInt n = xx->map->n; 1406 const PetscScalar *barray; 1407 PetscScalar *xarray; 1408 thrust::device_ptr<const PetscScalar> bGPU; 1409 thrust::device_ptr<PetscScalar> xGPU; 1410 cusparseStatus_t stat; 1411 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1412 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1413 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1414 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1415 PetscErrorCode ierr; 1416 cudaError_t cerr; 1417 1418 PetscFunctionBegin; 1419 /* Analyze the matrix and create the transpose ... on the fly */ 1420 if (!loTriFactorT && !upTriFactorT) { 1421 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1422 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1423 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1424 } 1425 1426 /* Get the GPU pointers */ 1427 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1428 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1429 xGPU = thrust::device_pointer_cast(xarray); 1430 bGPU = thrust::device_pointer_cast(barray); 1431 1432 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1433 /* First, reorder with the row permutation */ 1434 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1435 thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 1436 xGPU); 1437 1438 /* First, solve U */ 1439 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1440 upTriFactorT->csrMat->num_rows, 1441 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1442 upTriFactorT->csrMat->num_entries, 1443 #endif 1444 &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1445 upTriFactorT->csrMat->values->data().get(), 1446 upTriFactorT->csrMat->row_offsets->data().get(), 1447 upTriFactorT->csrMat->column_indices->data().get(), 1448 upTriFactorT->solveInfo, 1449 xarray, tempGPU->data().get() 1450 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1451 ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1452 #endif 1453 );CHKERRCUSPARSE(stat); 1454 1455 /* Then, solve L */ 1456 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1457 loTriFactorT->csrMat->num_rows, 1458 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1459 loTriFactorT->csrMat->num_entries, 1460 #endif 1461 &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1462 loTriFactorT->csrMat->values->data().get(), 1463 loTriFactorT->csrMat->row_offsets->data().get(), 1464 loTriFactorT->csrMat->column_indices->data().get(), 1465 loTriFactorT->solveInfo, 1466 tempGPU->data().get(), xarray 1467 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1468 ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1469 #endif 1470 );CHKERRCUSPARSE(stat); 1471 1472 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1473 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1474 thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1475 tempGPU->begin()); 1476 1477 /* Copy the temporary to the full solution. */ 1478 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU); 1479 1480 /* restore */ 1481 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1482 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1483 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1484 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1485 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1486 PetscFunctionReturn(0); 1487 } 1488 1489 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1490 { 1491 const PetscScalar *barray; 1492 PetscScalar *xarray; 1493 cusparseStatus_t stat; 1494 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1495 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1496 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1497 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1498 PetscErrorCode ierr; 1499 cudaError_t cerr; 1500 1501 PetscFunctionBegin; 1502 /* Analyze the matrix and create the transpose ... on the fly */ 1503 if (!loTriFactorT && !upTriFactorT) { 1504 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1505 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1506 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1507 } 1508 1509 /* Get the GPU pointers */ 1510 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1511 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1512 1513 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1514 /* First, solve U */ 1515 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1516 upTriFactorT->csrMat->num_rows, 1517 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1518 upTriFactorT->csrMat->num_entries, 1519 #endif 1520 &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1521 upTriFactorT->csrMat->values->data().get(), 1522 upTriFactorT->csrMat->row_offsets->data().get(), 1523 upTriFactorT->csrMat->column_indices->data().get(), 1524 upTriFactorT->solveInfo, 1525 barray, tempGPU->data().get() 1526 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1527 ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1528 #endif 1529 );CHKERRCUSPARSE(stat); 1530 1531 /* Then, solve L */ 1532 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1533 loTriFactorT->csrMat->num_rows, 1534 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1535 loTriFactorT->csrMat->num_entries, 1536 #endif 1537 &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1538 loTriFactorT->csrMat->values->data().get(), 1539 loTriFactorT->csrMat->row_offsets->data().get(), 1540 loTriFactorT->csrMat->column_indices->data().get(), 1541 loTriFactorT->solveInfo, 1542 tempGPU->data().get(), xarray 1543 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1544 ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1545 #endif 1546 );CHKERRCUSPARSE(stat); 1547 1548 /* restore */ 1549 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1550 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1551 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1552 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1553 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1554 PetscFunctionReturn(0); 1555 } 1556 1557 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1558 { 1559 const PetscScalar *barray; 1560 PetscScalar *xarray; 1561 thrust::device_ptr<const PetscScalar> bGPU; 1562 thrust::device_ptr<PetscScalar> xGPU; 1563 cusparseStatus_t stat; 1564 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1565 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1566 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1567 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1568 PetscErrorCode ierr; 1569 cudaError_t cerr; 1570 1571 PetscFunctionBegin; 1572 1573 /* Get the GPU pointers */ 1574 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1575 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1576 xGPU = thrust::device_pointer_cast(xarray); 1577 bGPU = thrust::device_pointer_cast(barray); 1578 1579 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1580 /* First, reorder with the row permutation */ 1581 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1582 thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 1583 tempGPU->begin()); 1584 1585 /* Next, solve L */ 1586 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1587 loTriFactor->csrMat->num_rows, 1588 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1589 loTriFactor->csrMat->num_entries, 1590 #endif 1591 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1592 loTriFactor->csrMat->values->data().get(), 1593 loTriFactor->csrMat->row_offsets->data().get(), 1594 loTriFactor->csrMat->column_indices->data().get(), 1595 loTriFactor->solveInfo, 1596 tempGPU->data().get(), xarray 1597 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1598 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 1599 #endif 1600 );CHKERRCUSPARSE(stat); 1601 1602 /* Then, solve U */ 1603 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1604 upTriFactor->csrMat->num_rows, 1605 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1606 upTriFactor->csrMat->num_entries, 1607 #endif 1608 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1609 upTriFactor->csrMat->values->data().get(), 1610 upTriFactor->csrMat->row_offsets->data().get(), 1611 upTriFactor->csrMat->column_indices->data().get(), 1612 upTriFactor->solveInfo, 1613 xarray, tempGPU->data().get() 1614 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1615 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 1616 #endif 1617 );CHKERRCUSPARSE(stat); 1618 1619 /* Last, reorder with the column permutation */ 1620 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 1621 thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), 1622 xGPU); 1623 1624 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1625 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1626 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1627 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1628 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1629 PetscFunctionReturn(0); 1630 } 1631 1632 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1633 { 1634 const PetscScalar *barray; 1635 PetscScalar *xarray; 1636 cusparseStatus_t stat; 1637 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1638 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1639 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1640 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1641 PetscErrorCode ierr; 1642 cudaError_t cerr; 1643 1644 PetscFunctionBegin; 1645 /* Get the GPU pointers */ 1646 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1647 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1648 1649 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1650 /* First, solve L */ 1651 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1652 loTriFactor->csrMat->num_rows, 1653 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1654 loTriFactor->csrMat->num_entries, 1655 #endif 1656 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1657 loTriFactor->csrMat->values->data().get(), 1658 loTriFactor->csrMat->row_offsets->data().get(), 1659 loTriFactor->csrMat->column_indices->data().get(), 1660 loTriFactor->solveInfo, 1661 barray, tempGPU->data().get() 1662 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1663 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 1664 #endif 1665 );CHKERRCUSPARSE(stat); 1666 1667 /* Next, solve U */ 1668 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1669 upTriFactor->csrMat->num_rows, 1670 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1671 upTriFactor->csrMat->num_entries, 1672 #endif 1673 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1674 upTriFactor->csrMat->values->data().get(), 1675 upTriFactor->csrMat->row_offsets->data().get(), 1676 upTriFactor->csrMat->column_indices->data().get(), 1677 upTriFactor->solveInfo, 1678 tempGPU->data().get(), xarray 1679 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1680 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 1681 #endif 1682 );CHKERRCUSPARSE(stat); 1683 1684 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1685 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1686 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1687 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1688 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1689 PetscFunctionReturn(0); 1690 } 1691 1692 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) 1693 { 1694 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1695 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1696 cudaError_t cerr; 1697 PetscErrorCode ierr; 1698 1699 PetscFunctionBegin; 1700 if (A->offloadmask == PETSC_OFFLOAD_GPU) { 1701 CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat; 1702 1703 ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 1704 cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 1705 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1706 ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr); 1707 ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 1708 A->offloadmask = PETSC_OFFLOAD_BOTH; 1709 } 1710 PetscFunctionReturn(0); 1711 } 1712 1713 static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 1714 { 1715 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1716 PetscErrorCode ierr; 1717 1718 PetscFunctionBegin; 1719 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 1720 *array = a->a; 1721 A->offloadmask = PETSC_OFFLOAD_CPU; 1722 PetscFunctionReturn(0); 1723 } 1724 1725 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 1726 { 1727 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1728 Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 1729 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1730 PetscInt m = A->rmap->n,*ii,*ridx,tmp; 1731 PetscErrorCode ierr; 1732 cusparseStatus_t stat; 1733 PetscBool both = PETSC_TRUE; 1734 cudaError_t err; 1735 1736 PetscFunctionBegin; 1737 if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Cannot copy to GPU"); 1738 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 1739 if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */ 1740 CsrMatrix *matrix; 1741 matrix = (CsrMatrix*)cusparsestruct->mat->mat; 1742 1743 if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR values"); 1744 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1745 matrix->values->assign(a->a, a->a+a->nz); 1746 err = WaitForCUDA();CHKERRCUDA(err); 1747 ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 1748 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1749 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); 1750 } else { 1751 PetscInt nnz; 1752 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1753 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr); 1754 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 1755 delete cusparsestruct->workVector; 1756 delete cusparsestruct->rowoffsets_gpu; 1757 cusparsestruct->workVector = NULL; 1758 cusparsestruct->rowoffsets_gpu = NULL; 1759 try { 1760 if (a->compressedrow.use) { 1761 m = a->compressedrow.nrows; 1762 ii = a->compressedrow.i; 1763 ridx = a->compressedrow.rindex; 1764 } else { 1765 m = A->rmap->n; 1766 ii = a->i; 1767 ridx = NULL; 1768 } 1769 if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR row data"); 1770 if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR column data"); 1771 if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; } 1772 else nnz = a->nz; 1773 1774 /* create cusparse matrix */ 1775 cusparsestruct->nrows = m; 1776 matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 1777 stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat); 1778 stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 1779 stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1780 1781 err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 1782 err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1783 err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1784 err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1785 err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1786 err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1787 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 1788 1789 /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1790 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1791 /* set the matrix */ 1792 CsrMatrix *mat= new CsrMatrix; 1793 mat->num_rows = m; 1794 mat->num_cols = A->cmap->n; 1795 mat->num_entries = nnz; 1796 mat->row_offsets = new THRUSTINTARRAY32(m+1); 1797 mat->row_offsets->assign(ii, ii + m+1); 1798 1799 mat->column_indices = new THRUSTINTARRAY32(nnz); 1800 mat->column_indices->assign(a->j, a->j+nnz); 1801 1802 mat->values = new THRUSTARRAY(nnz); 1803 if (a->a) mat->values->assign(a->a, a->a+nnz); 1804 1805 /* assign the pointer */ 1806 matstruct->mat = mat; 1807 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1808 if (mat->num_rows) { /* cusparse errors on empty matrices! */ 1809 stat = cusparseCreateCsr(&matstruct->matDescr, 1810 mat->num_rows, mat->num_cols, mat->num_entries, 1811 mat->row_offsets->data().get(), mat->column_indices->data().get(), 1812 mat->values->data().get(), 1813 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 1814 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 1815 } 1816 #endif 1817 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1818 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1819 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1820 #else 1821 CsrMatrix *mat= new CsrMatrix; 1822 mat->num_rows = m; 1823 mat->num_cols = A->cmap->n; 1824 mat->num_entries = nnz; 1825 mat->row_offsets = new THRUSTINTARRAY32(m+1); 1826 mat->row_offsets->assign(ii, ii + m+1); 1827 1828 mat->column_indices = new THRUSTINTARRAY32(nnz); 1829 mat->column_indices->assign(a->j, a->j+nnz); 1830 1831 mat->values = new THRUSTARRAY(nnz); 1832 if (a->a) mat->values->assign(a->a, a->a+nnz); 1833 1834 cusparseHybMat_t hybMat; 1835 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1836 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1837 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1838 stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, 1839 matstruct->descr, mat->values->data().get(), 1840 mat->row_offsets->data().get(), 1841 mat->column_indices->data().get(), 1842 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1843 /* assign the pointer */ 1844 matstruct->mat = hybMat; 1845 1846 if (mat) { 1847 if (mat->values) delete (THRUSTARRAY*)mat->values; 1848 if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices; 1849 if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets; 1850 delete (CsrMatrix*)mat; 1851 } 1852 #endif 1853 } 1854 1855 /* assign the compressed row indices */ 1856 if (a->compressedrow.use) { 1857 cusparsestruct->workVector = new THRUSTARRAY(m); 1858 matstruct->cprowIndices = new THRUSTINTARRAY(m); 1859 matstruct->cprowIndices->assign(ridx,ridx+m); 1860 tmp = m; 1861 } else { 1862 cusparsestruct->workVector = NULL; 1863 matstruct->cprowIndices = NULL; 1864 tmp = 0; 1865 } 1866 ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr); 1867 1868 /* assign the pointer */ 1869 cusparsestruct->mat = matstruct; 1870 } catch(char *ex) { 1871 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1872 } 1873 err = WaitForCUDA();CHKERRCUDA(err); 1874 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1875 cusparsestruct->nonzerostate = A->nonzerostate; 1876 } 1877 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 1878 } 1879 PetscFunctionReturn(0); 1880 } 1881 1882 struct VecCUDAPlusEquals 1883 { 1884 template <typename Tuple> 1885 __host__ __device__ 1886 void operator()(Tuple t) 1887 { 1888 thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 1889 } 1890 }; 1891 1892 struct VecCUDAEquals 1893 { 1894 template <typename Tuple> 1895 __host__ __device__ 1896 void operator()(Tuple t) 1897 { 1898 thrust::get<1>(t) = thrust::get<0>(t); 1899 } 1900 }; 1901 1902 struct VecCUDAEqualsReverse 1903 { 1904 template <typename Tuple> 1905 __host__ __device__ 1906 void operator()(Tuple t) 1907 { 1908 thrust::get<0>(t) = thrust::get<1>(t); 1909 } 1910 }; 1911 1912 struct MatMatCusparse { 1913 PetscBool cisdense; 1914 PetscScalar *Bt; 1915 Mat X; 1916 PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */ 1917 PetscLogDouble flops; 1918 CsrMatrix *Bcsr; 1919 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1920 cusparseSpMatDescr_t matSpBDescr; 1921 PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 1922 cusparseDnMatDescr_t matBDescr; 1923 cusparseDnMatDescr_t matCDescr; 1924 PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/ 1925 size_t mmBufferSize; 1926 void *mmBuffer; 1927 void *mmBuffer2; /* SpGEMM WorkEstimation buffer */ 1928 cusparseSpGEMMDescr_t spgemmDesc; 1929 #endif 1930 }; 1931 1932 static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 1933 { 1934 PetscErrorCode ierr; 1935 MatMatCusparse *mmdata = (MatMatCusparse *)data; 1936 cudaError_t cerr; 1937 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1938 cusparseStatus_t stat; 1939 #endif 1940 1941 PetscFunctionBegin; 1942 cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr); 1943 delete mmdata->Bcsr; 1944 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1945 if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); } 1946 if (mmdata->mmBuffer) { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); } 1947 if (mmdata->mmBuffer2) { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); } 1948 if (mmdata->matBDescr) { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); } 1949 if (mmdata->matCDescr) { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); } 1950 if (mmdata->spgemmDesc) { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); } 1951 #endif 1952 ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr); 1953 ierr = PetscFree(data);CHKERRQ(ierr); 1954 PetscFunctionReturn(0); 1955 } 1956 1957 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool); 1958 1959 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 1960 { 1961 Mat_Product *product = C->product; 1962 Mat A,B; 1963 PetscInt m,n,blda,clda; 1964 PetscBool flg,biscuda; 1965 Mat_SeqAIJCUSPARSE *cusp; 1966 cusparseStatus_t stat; 1967 cusparseOperation_t opA; 1968 const PetscScalar *barray; 1969 PetscScalar *carray; 1970 PetscErrorCode ierr; 1971 MatMatCusparse *mmdata; 1972 Mat_SeqAIJCUSPARSEMultStruct *mat; 1973 CsrMatrix *csrmat; 1974 cudaError_t cerr; 1975 1976 PetscFunctionBegin; 1977 MatCheckProduct(C,1); 1978 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 1979 mmdata = (MatMatCusparse*)product->data; 1980 A = product->A; 1981 B = product->B; 1982 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 1983 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 1984 /* currently CopyToGpu does not copy if the matrix is bound to CPU 1985 Instead of silently accepting the wrong answer, I prefer to raise the error */ 1986 if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 1987 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1988 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1989 switch (product->type) { 1990 case MATPRODUCT_AB: 1991 case MATPRODUCT_PtAP: 1992 mat = cusp->mat; 1993 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1994 m = A->rmap->n; 1995 n = B->cmap->n; 1996 break; 1997 case MATPRODUCT_AtB: 1998 if (!cusp->transgen) { 1999 mat = cusp->mat; 2000 opA = CUSPARSE_OPERATION_TRANSPOSE; 2001 } else { 2002 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 2003 mat = cusp->matTranspose; 2004 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2005 } 2006 m = A->cmap->n; 2007 n = B->cmap->n; 2008 break; 2009 case MATPRODUCT_ABt: 2010 case MATPRODUCT_RARt: 2011 mat = cusp->mat; 2012 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2013 m = A->rmap->n; 2014 n = B->rmap->n; 2015 break; 2016 default: 2017 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 2018 } 2019 if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 2020 csrmat = (CsrMatrix*)mat->mat; 2021 /* if the user passed a CPU matrix, copy the data to the GPU */ 2022 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr); 2023 if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);} 2024 ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr); 2025 2026 ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr); 2027 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2028 ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2029 ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr); 2030 } else { 2031 ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr); 2032 ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr); 2033 } 2034 2035 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2036 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2037 cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 2038 /* (re)allcoate mmBuffer if not initialized or LDAs are different */ 2039 if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 2040 size_t mmBufferSize; 2041 if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;} 2042 if (!mmdata->matBDescr) { 2043 stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2044 mmdata->Blda = blda; 2045 } 2046 2047 if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;} 2048 if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 2049 stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2050 mmdata->Clda = clda; 2051 } 2052 2053 if (!mat->matDescr) { 2054 stat = cusparseCreateCsr(&mat->matDescr, 2055 csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, 2056 csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), 2057 csrmat->values->data().get(), 2058 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2059 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 2060 } 2061 stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one, 2062 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2063 mmdata->matCDescr,cusparse_scalartype, 2064 cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat); 2065 if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) { 2066 cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); 2067 cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr); 2068 mmdata->mmBufferSize = mmBufferSize; 2069 } 2070 mmdata->initialized = PETSC_TRUE; 2071 } else { 2072 /* to be safe, always update pointers of the mats */ 2073 stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat); 2074 stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat); 2075 stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat); 2076 } 2077 2078 /* do cusparseSpMM, which supports transpose on B */ 2079 stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one, 2080 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2081 mmdata->matCDescr,cusparse_scalartype, 2082 cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2083 #else 2084 PetscInt k; 2085 /* cusparseXcsrmm does not support transpose on B */ 2086 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2087 cublasHandle_t cublasv2handle; 2088 cublasStatus_t cerr; 2089 2090 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 2091 cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T, 2092 B->cmap->n,B->rmap->n, 2093 &PETSC_CUSPARSE_ONE ,barray,blda, 2094 &PETSC_CUSPARSE_ZERO,barray,blda, 2095 mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr); 2096 blda = B->cmap->n; 2097 k = B->cmap->n; 2098 } else { 2099 k = B->rmap->n; 2100 } 2101 2102 /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 2103 stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k, 2104 csrmat->num_entries,mat->alpha_one,mat->descr, 2105 csrmat->values->data().get(), 2106 csrmat->row_offsets->data().get(), 2107 csrmat->column_indices->data().get(), 2108 mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero, 2109 carray,clda);CHKERRCUSPARSE(stat); 2110 #endif 2111 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2112 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2113 ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr); 2114 ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr); 2115 if (product->type == MATPRODUCT_RARt) { 2116 ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2117 ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2118 } else if (product->type == MATPRODUCT_PtAP) { 2119 ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2120 ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2121 } else { 2122 ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr); 2123 } 2124 if (mmdata->cisdense) { 2125 ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr); 2126 } 2127 if (!biscuda) { 2128 ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 2129 } 2130 PetscFunctionReturn(0); 2131 } 2132 2133 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2134 { 2135 Mat_Product *product = C->product; 2136 Mat A,B; 2137 PetscInt m,n; 2138 PetscBool cisdense,flg; 2139 PetscErrorCode ierr; 2140 MatMatCusparse *mmdata; 2141 Mat_SeqAIJCUSPARSE *cusp; 2142 2143 PetscFunctionBegin; 2144 MatCheckProduct(C,1); 2145 if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty"); 2146 A = product->A; 2147 B = product->B; 2148 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2149 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 2150 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2151 if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2152 switch (product->type) { 2153 case MATPRODUCT_AB: 2154 m = A->rmap->n; 2155 n = B->cmap->n; 2156 break; 2157 case MATPRODUCT_AtB: 2158 m = A->cmap->n; 2159 n = B->cmap->n; 2160 break; 2161 case MATPRODUCT_ABt: 2162 m = A->rmap->n; 2163 n = B->rmap->n; 2164 break; 2165 case MATPRODUCT_PtAP: 2166 m = B->cmap->n; 2167 n = B->cmap->n; 2168 break; 2169 case MATPRODUCT_RARt: 2170 m = B->rmap->n; 2171 n = B->rmap->n; 2172 break; 2173 default: 2174 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 2175 } 2176 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2177 /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */ 2178 ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr); 2179 ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr); 2180 2181 /* product data */ 2182 ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2183 mmdata->cisdense = cisdense; 2184 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0) 2185 /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */ 2186 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2187 cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr); 2188 } 2189 #endif 2190 /* for these products we need intermediate storage */ 2191 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2192 ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr); 2193 ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr); 2194 if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */ 2195 ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr); 2196 } else { 2197 ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr); 2198 } 2199 } 2200 C->product->data = mmdata; 2201 C->product->destroy = MatDestroy_MatMatCusparse; 2202 2203 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 2204 PetscFunctionReturn(0); 2205 } 2206 2207 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2208 { 2209 Mat_Product *product = C->product; 2210 Mat A,B; 2211 Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2212 Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 2213 Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2214 CsrMatrix *Acsr,*Bcsr,*Ccsr; 2215 PetscBool flg; 2216 PetscErrorCode ierr; 2217 cusparseStatus_t stat; 2218 cudaError_t cerr; 2219 MatProductType ptype; 2220 MatMatCusparse *mmdata; 2221 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2222 cusparseSpMatDescr_t BmatSpDescr; 2223 #endif 2224 2225 PetscFunctionBegin; 2226 MatCheckProduct(C,1); 2227 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 2228 ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2229 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for C of type %s",((PetscObject)C)->type_name); 2230 mmdata = (MatMatCusparse*)C->product->data; 2231 A = product->A; 2232 B = product->B; 2233 if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */ 2234 mmdata->reusesym = PETSC_FALSE; 2235 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2236 if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2237 Cmat = Ccusp->mat; 2238 if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]); 2239 Ccsr = (CsrMatrix*)Cmat->mat; 2240 if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct"); 2241 goto finalize; 2242 } 2243 if (!c->nz) goto finalize; 2244 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2245 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 2246 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2247 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name); 2248 if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2249 if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2250 Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2251 Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2252 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2253 if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2254 if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2255 if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2256 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2257 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 2258 2259 ptype = product->type; 2260 if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 2261 if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 2262 switch (ptype) { 2263 case MATPRODUCT_AB: 2264 Amat = Acusp->mat; 2265 Bmat = Bcusp->mat; 2266 break; 2267 case MATPRODUCT_AtB: 2268 Amat = Acusp->matTranspose; 2269 Bmat = Bcusp->mat; 2270 break; 2271 case MATPRODUCT_ABt: 2272 Amat = Acusp->mat; 2273 Bmat = Bcusp->matTranspose; 2274 break; 2275 default: 2276 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 2277 } 2278 Cmat = Ccusp->mat; 2279 if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2280 if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2281 if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[ptype]); 2282 Acsr = (CsrMatrix*)Amat->mat; 2283 Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */ 2284 Ccsr = (CsrMatrix*)Cmat->mat; 2285 if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct"); 2286 if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct"); 2287 if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct"); 2288 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2289 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2290 BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */ 2291 stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2292 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2293 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2294 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2295 stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2296 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2297 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2298 #else 2299 stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2300 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2301 Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2302 Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2303 Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2304 #endif 2305 ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2306 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2307 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2308 C->offloadmask = PETSC_OFFLOAD_GPU; 2309 finalize: 2310 /* shorter version of MatAssemblyEnd_SeqAIJ */ 2311 ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 2312 ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 2313 ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr); 2314 c->reallocs = 0; 2315 C->info.mallocs += 0; 2316 C->info.nz_unneeded = 0; 2317 C->assembled = C->was_assembled = PETSC_TRUE; 2318 C->num_ass++; 2319 PetscFunctionReturn(0); 2320 } 2321 2322 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2323 { 2324 Mat_Product *product = C->product; 2325 Mat A,B; 2326 Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2327 Mat_SeqAIJ *a,*b,*c; 2328 Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2329 CsrMatrix *Acsr,*Bcsr,*Ccsr; 2330 PetscInt i,j,m,n,k; 2331 PetscBool flg; 2332 PetscErrorCode ierr; 2333 cusparseStatus_t stat; 2334 cudaError_t cerr; 2335 MatProductType ptype; 2336 MatMatCusparse *mmdata; 2337 PetscLogDouble flops; 2338 PetscBool biscompressed,ciscompressed; 2339 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2340 int64_t C_num_rows1, C_num_cols1, C_nnz1; 2341 size_t bufSize2; 2342 cusparseSpMatDescr_t BmatSpDescr; 2343 #else 2344 int cnz; 2345 #endif 2346 2347 PetscFunctionBegin; 2348 MatCheckProduct(C,1); 2349 if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty"); 2350 A = product->A; 2351 B = product->B; 2352 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2353 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 2354 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2355 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name); 2356 a = (Mat_SeqAIJ*)A->data; 2357 b = (Mat_SeqAIJ*)B->data; 2358 Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2359 Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2360 if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2361 if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2362 2363 /* product data */ 2364 ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2365 C->product->data = mmdata; 2366 C->product->destroy = MatDestroy_MatMatCusparse; 2367 2368 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2369 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 2370 ptype = product->type; 2371 if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 2372 if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 2373 biscompressed = PETSC_FALSE; 2374 ciscompressed = PETSC_FALSE; 2375 switch (ptype) { 2376 case MATPRODUCT_AB: 2377 m = A->rmap->n; 2378 n = B->cmap->n; 2379 k = A->cmap->n; 2380 Amat = Acusp->mat; 2381 Bmat = Bcusp->mat; 2382 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2383 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2384 break; 2385 case MATPRODUCT_AtB: 2386 m = A->cmap->n; 2387 n = B->cmap->n; 2388 k = A->rmap->n; 2389 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 2390 Amat = Acusp->matTranspose; 2391 Bmat = Bcusp->mat; 2392 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2393 break; 2394 case MATPRODUCT_ABt: 2395 m = A->rmap->n; 2396 n = B->rmap->n; 2397 k = A->cmap->n; 2398 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr); 2399 Amat = Acusp->mat; 2400 Bmat = Bcusp->matTranspose; 2401 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2402 break; 2403 default: 2404 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 2405 } 2406 2407 /* create cusparse matrix */ 2408 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2409 ierr = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 2410 c = (Mat_SeqAIJ*)C->data; 2411 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2412 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 2413 Ccsr = new CsrMatrix; 2414 2415 c->compressedrow.use = ciscompressed; 2416 if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */ 2417 c->compressedrow.nrows = a->compressedrow.nrows; 2418 ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr); 2419 ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr); 2420 Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 2421 Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 2422 Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows); 2423 } else { 2424 c->compressedrow.nrows = 0; 2425 c->compressedrow.i = NULL; 2426 c->compressedrow.rindex = NULL; 2427 Ccusp->workVector = NULL; 2428 Cmat->cprowIndices = NULL; 2429 } 2430 Ccusp->nrows = ciscompressed ? c->compressedrow.nrows : m; 2431 Ccusp->mat = Cmat; 2432 Ccusp->mat->mat = Ccsr; 2433 Ccsr->num_rows = Ccusp->nrows; 2434 Ccsr->num_cols = n; 2435 Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1); 2436 stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat); 2437 stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 2438 stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 2439 cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 2440 cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 2441 cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 2442 cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2443 cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2444 cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2445 if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */ 2446 thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0); 2447 c->nz = 0; 2448 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2449 Ccsr->values = new THRUSTARRAY(c->nz); 2450 goto finalizesym; 2451 } 2452 2453 if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2454 if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2455 Acsr = (CsrMatrix*)Amat->mat; 2456 if (!biscompressed) { 2457 Bcsr = (CsrMatrix*)Bmat->mat; 2458 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2459 BmatSpDescr = Bmat->matDescr; 2460 #endif 2461 } else { /* we need to use row offsets for the full matrix */ 2462 CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat; 2463 Bcsr = new CsrMatrix; 2464 Bcsr->num_rows = B->rmap->n; 2465 Bcsr->num_cols = cBcsr->num_cols; 2466 Bcsr->num_entries = cBcsr->num_entries; 2467 Bcsr->column_indices = cBcsr->column_indices; 2468 Bcsr->values = cBcsr->values; 2469 if (!Bcusp->rowoffsets_gpu) { 2470 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 2471 Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 2472 ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 2473 } 2474 Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 2475 mmdata->Bcsr = Bcsr; 2476 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2477 if (Bcsr->num_rows && Bcsr->num_cols) { 2478 stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, 2479 Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2480 Bcsr->values->data().get(), 2481 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2482 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2483 } 2484 BmatSpDescr = mmdata->matSpBDescr; 2485 #endif 2486 } 2487 if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct"); 2488 if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct"); 2489 /* precompute flops count */ 2490 if (ptype == MATPRODUCT_AB) { 2491 for (i=0, flops = 0; i<A->rmap->n; i++) { 2492 const PetscInt st = a->i[i]; 2493 const PetscInt en = a->i[i+1]; 2494 for (j=st; j<en; j++) { 2495 const PetscInt brow = a->j[j]; 2496 flops += 2.*(b->i[brow+1] - b->i[brow]); 2497 } 2498 } 2499 } else if (ptype == MATPRODUCT_AtB) { 2500 for (i=0, flops = 0; i<A->rmap->n; i++) { 2501 const PetscInt anzi = a->i[i+1] - a->i[i]; 2502 const PetscInt bnzi = b->i[i+1] - b->i[i]; 2503 flops += (2.*anzi)*bnzi; 2504 } 2505 } else { /* TODO */ 2506 flops = 0.; 2507 } 2508 2509 mmdata->flops = flops; 2510 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2511 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2512 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2513 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, 2514 NULL, NULL, NULL, 2515 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2516 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2517 stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2518 /* ask bufferSize bytes for external memory */ 2519 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2520 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2521 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2522 mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat); 2523 cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr); 2524 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2525 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2526 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2527 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2528 mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat); 2529 /* ask bufferSize again bytes for external memory */ 2530 stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2531 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2532 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2533 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat); 2534 /* The CUSPARSE documentation is not clear, nor the API 2535 We need both buffers to perform the operations properly! 2536 mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 2537 it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 2538 is stored in the descriptor! What a messy API... */ 2539 cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr); 2540 /* compute the intermediate product of A * B */ 2541 stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2542 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2543 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2544 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2545 /* get matrix C non-zero entries C_nnz1 */ 2546 stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat); 2547 c->nz = (PetscInt) C_nnz1; 2548 ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr); 2549 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2550 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2551 Ccsr->values = new THRUSTARRAY(c->nz); 2552 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2553 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2554 Ccsr->values->data().get());CHKERRCUSPARSE(stat); 2555 stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2556 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2557 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2558 #else 2559 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 2560 stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2561 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2562 Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2563 Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2564 Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat); 2565 c->nz = cnz; 2566 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2567 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2568 Ccsr->values = new THRUSTARRAY(c->nz); 2569 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2570 2571 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2572 /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 2573 I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 2574 D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */ 2575 stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2576 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2577 Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2578 Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2579 Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2580 #endif 2581 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2582 ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2583 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2584 finalizesym: 2585 c->singlemalloc = PETSC_FALSE; 2586 c->free_a = PETSC_TRUE; 2587 c->free_ij = PETSC_TRUE; 2588 ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 2589 ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 2590 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 2591 PetscInt *d_i = c->i; 2592 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 2593 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 2594 ii = *Ccsr->row_offsets; 2595 jj = *Ccsr->column_indices; 2596 if (ciscompressed) d_i = c->compressedrow.i; 2597 cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2598 cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2599 } else { 2600 PetscInt *d_i = c->i; 2601 if (ciscompressed) d_i = c->compressedrow.i; 2602 cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2603 cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2604 } 2605 if (ciscompressed) { /* need to expand host row offsets */ 2606 PetscInt r = 0; 2607 c->i[0] = 0; 2608 for (k = 0; k < c->compressedrow.nrows; k++) { 2609 const PetscInt next = c->compressedrow.rindex[k]; 2610 const PetscInt old = c->compressedrow.i[k]; 2611 for (; r < next; r++) c->i[r+1] = old; 2612 } 2613 for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows]; 2614 } 2615 ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 2616 ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 2617 ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 2618 c->maxnz = c->nz; 2619 c->nonzerorowcnt = 0; 2620 c->rmax = 0; 2621 for (k = 0; k < m; k++) { 2622 const PetscInt nn = c->i[k+1] - c->i[k]; 2623 c->ilen[k] = c->imax[k] = nn; 2624 c->nonzerorowcnt += (PetscInt)!!nn; 2625 c->rmax = PetscMax(c->rmax,nn); 2626 } 2627 ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr); 2628 ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 2629 Ccsr->num_entries = c->nz; 2630 2631 C->nonzerostate++; 2632 ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr); 2633 ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr); 2634 Ccusp->nonzerostate = C->nonzerostate; 2635 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 2636 C->preallocated = PETSC_TRUE; 2637 C->assembled = PETSC_FALSE; 2638 C->was_assembled = PETSC_FALSE; 2639 if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */ 2640 mmdata->reusesym = PETSC_TRUE; 2641 C->offloadmask = PETSC_OFFLOAD_GPU; 2642 } 2643 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2644 PetscFunctionReturn(0); 2645 } 2646 2647 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 2648 2649 /* handles sparse or dense B */ 2650 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat) 2651 { 2652 Mat_Product *product = mat->product; 2653 PetscErrorCode ierr; 2654 PetscBool isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE; 2655 2656 PetscFunctionBegin; 2657 MatCheckProduct(mat,1); 2658 ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr); 2659 if (!product->A->boundtocpu && !product->B->boundtocpu) { 2660 ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr); 2661 } 2662 if (product->type == MATPRODUCT_ABC) { 2663 Ciscusp = PETSC_FALSE; 2664 if (!product->C->boundtocpu) { 2665 ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr); 2666 } 2667 } 2668 if (isdense) { 2669 switch (product->type) { 2670 case MATPRODUCT_AB: 2671 case MATPRODUCT_AtB: 2672 case MATPRODUCT_ABt: 2673 case MATPRODUCT_PtAP: 2674 case MATPRODUCT_RARt: 2675 if (product->A->boundtocpu) { 2676 ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr); 2677 } else { 2678 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 2679 } 2680 break; 2681 case MATPRODUCT_ABC: 2682 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2683 break; 2684 default: 2685 break; 2686 } 2687 } else if (Biscusp && Ciscusp) { 2688 switch (product->type) { 2689 case MATPRODUCT_AB: 2690 case MATPRODUCT_AtB: 2691 case MATPRODUCT_ABt: 2692 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2693 break; 2694 case MATPRODUCT_PtAP: 2695 case MATPRODUCT_RARt: 2696 case MATPRODUCT_ABC: 2697 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2698 break; 2699 default: 2700 break; 2701 } 2702 } else { /* fallback for AIJ */ 2703 ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 2704 } 2705 PetscFunctionReturn(0); 2706 } 2707 2708 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2709 { 2710 PetscErrorCode ierr; 2711 2712 PetscFunctionBegin; 2713 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2714 PetscFunctionReturn(0); 2715 } 2716 2717 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz) 2718 { 2719 PetscErrorCode ierr; 2720 2721 PetscFunctionBegin; 2722 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2723 PetscFunctionReturn(0); 2724 } 2725 2726 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2727 { 2728 PetscErrorCode ierr; 2729 2730 PetscFunctionBegin; 2731 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 2732 PetscFunctionReturn(0); 2733 } 2734 2735 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2736 { 2737 PetscErrorCode ierr; 2738 2739 PetscFunctionBegin; 2740 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 2741 PetscFunctionReturn(0); 2742 } 2743 2744 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2745 { 2746 PetscErrorCode ierr; 2747 2748 PetscFunctionBegin; 2749 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2750 PetscFunctionReturn(0); 2751 } 2752 2753 __global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y) 2754 { 2755 int i = blockIdx.x*blockDim.x + threadIdx.x; 2756 if (i < n) y[idx[i]] += x[i]; 2757 } 2758 2759 /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 2760 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm) 2761 { 2762 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2763 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 2764 Mat_SeqAIJCUSPARSEMultStruct *matstruct; 2765 PetscScalar *xarray,*zarray,*dptr,*beta,*xptr; 2766 PetscErrorCode ierr; 2767 cudaError_t cerr; 2768 cusparseStatus_t stat; 2769 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2770 PetscBool compressed; 2771 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2772 PetscInt nx,ny; 2773 #endif 2774 2775 PetscFunctionBegin; 2776 if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported"); 2777 if (!a->nonzerorowcnt) { 2778 if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);} 2779 else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);} 2780 PetscFunctionReturn(0); 2781 } 2782 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 2783 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2784 if (!trans) { 2785 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 2786 if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 2787 } else { 2788 if (herm || !cusparsestruct->transgen) { 2789 opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 2790 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 2791 } else { 2792 if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);} 2793 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 2794 } 2795 } 2796 /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 2797 compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE; 2798 2799 try { 2800 ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 2801 if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */ 2802 else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */ 2803 2804 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2805 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 2806 /* z = A x + beta y. 2807 If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 2808 When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 2809 */ 2810 xptr = xarray; 2811 dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 2812 beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 2813 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2814 /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 2815 allocated to accommodate different uses. So we get the length info directly from mat. 2816 */ 2817 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2818 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2819 nx = mat->num_cols; 2820 ny = mat->num_rows; 2821 } 2822 #endif 2823 } else { 2824 /* z = A^T x + beta y 2825 If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 2826 Note A^Tx is of full length, so we set beta to 1.0 if y exists. 2827 */ 2828 xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 2829 dptr = zarray; 2830 beta = yy ? matstruct->beta_one : matstruct->beta_zero; 2831 if (compressed) { /* Scatter x to work vector */ 2832 thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 2833 thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 2834 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 2835 VecCUDAEqualsReverse()); 2836 } 2837 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2838 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2839 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2840 nx = mat->num_rows; 2841 ny = mat->num_cols; 2842 } 2843 #endif 2844 } 2845 2846 /* csr_spmv does y = alpha op(A) x + beta y */ 2847 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2848 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2849 if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 2850 if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 2851 stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 2852 stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 2853 stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, 2854 matstruct->matDescr, 2855 matstruct->cuSpMV[opA].vecXDescr, beta, 2856 matstruct->cuSpMV[opA].vecYDescr, 2857 cusparse_scalartype, 2858 cusparsestruct->spmvAlg, 2859 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat); 2860 cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr); 2861 2862 matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 2863 } else { 2864 /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 2865 stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat); 2866 stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat); 2867 } 2868 2869 stat = cusparseSpMV(cusparsestruct->handle, opA, 2870 matstruct->alpha_one, 2871 matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */ 2872 matstruct->cuSpMV[opA].vecXDescr, 2873 beta, 2874 matstruct->cuSpMV[opA].vecYDescr, 2875 cusparse_scalartype, 2876 cusparsestruct->spmvAlg, 2877 matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat); 2878 #else 2879 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2880 stat = cusparse_csr_spmv(cusparsestruct->handle, opA, 2881 mat->num_rows, mat->num_cols, 2882 mat->num_entries, matstruct->alpha_one, matstruct->descr, 2883 mat->values->data().get(), mat->row_offsets->data().get(), 2884 mat->column_indices->data().get(), xptr, beta, 2885 dptr);CHKERRCUSPARSE(stat); 2886 #endif 2887 } else { 2888 if (cusparsestruct->nrows) { 2889 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2890 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 2891 #else 2892 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 2893 stat = cusparse_hyb_spmv(cusparsestruct->handle, opA, 2894 matstruct->alpha_one, matstruct->descr, hybMat, 2895 xptr, beta, 2896 dptr);CHKERRCUSPARSE(stat); 2897 #endif 2898 } 2899 } 2900 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2901 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2902 2903 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 2904 if (yy) { /* MatMultAdd: zz = A*xx + yy */ 2905 if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 2906 ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */ 2907 } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */ 2908 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 2909 } 2910 } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 2911 ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr); 2912 } 2913 2914 /* ScatterAdd the result from work vector into the full vector when A is compressed */ 2915 if (compressed) { 2916 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2917 /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred) 2918 and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to 2919 prevent that. So I just add a ScatterAdd kernel. 2920 */ 2921 #if 0 2922 thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 2923 thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream), 2924 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 2925 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 2926 VecCUDAPlusEquals()); 2927 #else 2928 PetscInt n = matstruct->cprowIndices->size(); 2929 ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray); 2930 #endif 2931 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2932 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2933 } 2934 } else { 2935 if (yy && yy != zz) { 2936 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 2937 } 2938 } 2939 ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 2940 if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);} 2941 else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);} 2942 } catch(char *ex) { 2943 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 2944 } 2945 if (yy) { 2946 ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); 2947 } else { 2948 ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr); 2949 } 2950 PetscFunctionReturn(0); 2951 } 2952 2953 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2954 { 2955 PetscErrorCode ierr; 2956 2957 PetscFunctionBegin; 2958 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2959 PetscFunctionReturn(0); 2960 } 2961 2962 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 2963 { 2964 PetscErrorCode ierr; 2965 PetscSplitCSRDataStructure *d_mat = NULL; 2966 PetscFunctionBegin; 2967 if (A->factortype == MAT_FACTOR_NONE) { 2968 d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat; 2969 } 2970 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it? 2971 if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0); 2972 if (d_mat) { 2973 A->offloadmask = PETSC_OFFLOAD_GPU; 2974 } 2975 2976 PetscFunctionReturn(0); 2977 } 2978 2979 /* --------------------------------------------------------------------------------*/ 2980 /*@ 2981 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 2982 (the default parallel PETSc format). This matrix will ultimately pushed down 2983 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 2984 assembly performance the user should preallocate the matrix storage by setting 2985 the parameter nz (or the array nnz). By setting these parameters accurately, 2986 performance during matrix assembly can be increased by more than a factor of 50. 2987 2988 Collective 2989 2990 Input Parameters: 2991 + comm - MPI communicator, set to PETSC_COMM_SELF 2992 . m - number of rows 2993 . n - number of columns 2994 . nz - number of nonzeros per row (same for all rows) 2995 - nnz - array containing the number of nonzeros in the various rows 2996 (possibly different for each row) or NULL 2997 2998 Output Parameter: 2999 . A - the matrix 3000 3001 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 3002 MatXXXXSetPreallocation() paradgm instead of this routine directly. 3003 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 3004 3005 Notes: 3006 If nnz is given then nz is ignored 3007 3008 The AIJ format (also called the Yale sparse matrix format or 3009 compressed row storage), is fully compatible with standard Fortran 77 3010 storage. That is, the stored row and column indices can begin at 3011 either one (as in Fortran) or zero. See the users' manual for details. 3012 3013 Specify the preallocated storage with either nz or nnz (not both). 3014 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 3015 allocation. For large problems you MUST preallocate memory or you 3016 will get TERRIBLE performance, see the users' manual chapter on matrices. 3017 3018 By default, this format uses inodes (identical nodes) when possible, to 3019 improve numerical efficiency of matrix-vector products and solves. We 3020 search for consecutive rows with the same nonzero structure, thereby 3021 reusing matrix information to achieve increased efficiency. 3022 3023 Level: intermediate 3024 3025 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 3026 @*/ 3027 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 3028 { 3029 PetscErrorCode ierr; 3030 3031 PetscFunctionBegin; 3032 ierr = MatCreate(comm,A);CHKERRQ(ierr); 3033 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 3034 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3035 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 3036 PetscFunctionReturn(0); 3037 } 3038 3039 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 3040 { 3041 PetscErrorCode ierr; 3042 PetscSplitCSRDataStructure *d_mat = NULL; 3043 3044 PetscFunctionBegin; 3045 if (A->factortype == MAT_FACTOR_NONE) { 3046 d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat; 3047 ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL; 3048 ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr); 3049 } else { 3050 ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr); 3051 } 3052 if (d_mat) { 3053 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3054 cudaError_t err; 3055 PetscSplitCSRDataStructure h_mat; 3056 ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr); 3057 err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err); 3058 if (a->compressedrow.use) { 3059 err = cudaFree(h_mat.diag.i);CHKERRCUDA(err); 3060 } 3061 err = cudaFree(d_mat);CHKERRCUDA(err); 3062 } 3063 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr); 3064 ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr); 3065 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 3066 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 3067 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr); 3068 ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 3069 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 3070 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 3071 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 3072 PetscFunctionReturn(0); 3073 } 3074 3075 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*); 3076 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool); 3077 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B) 3078 { 3079 PetscErrorCode ierr; 3080 3081 PetscFunctionBegin; 3082 ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 3083 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr); 3084 PetscFunctionReturn(0); 3085 } 3086 3087 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str) 3088 { 3089 PetscErrorCode ierr; 3090 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 3091 Mat_SeqAIJCUSPARSE *cy; 3092 Mat_SeqAIJCUSPARSE *cx; 3093 PetscScalar *ay; 3094 const PetscScalar *ax; 3095 CsrMatrix *csry,*csrx; 3096 cudaError_t cerr; 3097 3098 PetscFunctionBegin; 3099 cy = (Mat_SeqAIJCUSPARSE*)Y->spptr; 3100 cx = (Mat_SeqAIJCUSPARSE*)X->spptr; 3101 if (X->ops->axpy != Y->ops->axpy) { 3102 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr); 3103 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 3104 PetscFunctionReturn(0); 3105 } 3106 /* if we are here, it means both matrices are bound to GPU */ 3107 ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr); 3108 ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr); 3109 if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported"); 3110 if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported"); 3111 csry = (CsrMatrix*)cy->mat->mat; 3112 csrx = (CsrMatrix*)cx->mat->mat; 3113 /* see if we can turn this into a cublas axpy */ 3114 if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) { 3115 bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin()); 3116 if (eq) { 3117 eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin()); 3118 } 3119 if (eq) str = SAME_NONZERO_PATTERN; 3120 } 3121 /* spgeam is buggy with one column */ 3122 if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN; 3123 3124 if (str == SUBSET_NONZERO_PATTERN) { 3125 cusparseStatus_t stat; 3126 PetscScalar b = 1.0; 3127 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3128 size_t bufferSize; 3129 void *buffer; 3130 #endif 3131 3132 ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr); 3133 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3134 stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 3135 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3136 stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n, 3137 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3138 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3139 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat); 3140 cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr); 3141 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3142 stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3143 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3144 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3145 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat); 3146 cerr = WaitForCUDA();CHKERRCUDA(cerr); 3147 ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr); 3148 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3149 cerr = cudaFree(buffer);CHKERRCUDA(cerr); 3150 #else 3151 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3152 stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3153 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3154 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3155 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat); 3156 cerr = WaitForCUDA();CHKERRCUDA(cerr); 3157 ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr); 3158 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3159 #endif 3160 stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 3161 ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr); 3162 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3163 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3164 } else if (str == SAME_NONZERO_PATTERN) { 3165 cublasHandle_t cublasv2handle; 3166 cublasStatus_t berr; 3167 PetscBLASInt one = 1, bnz = 1; 3168 3169 ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr); 3170 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3171 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 3172 ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr); 3173 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3174 berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr); 3175 cerr = WaitForCUDA();CHKERRCUDA(cerr); 3176 ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr); 3177 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3178 ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr); 3179 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3180 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3181 } else { 3182 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr); 3183 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 3184 } 3185 PetscFunctionReturn(0); 3186 } 3187 3188 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a) 3189 { 3190 PetscErrorCode ierr; 3191 Mat_SeqAIJ *y = (Mat_SeqAIJ*)Y->data; 3192 PetscScalar *ay; 3193 cudaError_t cerr; 3194 cublasHandle_t cublasv2handle; 3195 cublasStatus_t berr; 3196 PetscBLASInt one = 1, bnz = 1; 3197 3198 PetscFunctionBegin; 3199 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3200 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 3201 ierr = PetscBLASIntCast(y->nz,&bnz);CHKERRQ(ierr); 3202 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3203 berr = cublasXscal(cublasv2handle,bnz,&a,ay,one);CHKERRCUBLAS(berr); 3204 cerr = WaitForCUDA();CHKERRCUDA(cerr); 3205 ierr = PetscLogGpuFlops(bnz);CHKERRQ(ierr); 3206 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3207 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3208 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3209 PetscFunctionReturn(0); 3210 } 3211 3212 static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) 3213 { 3214 PetscErrorCode ierr; 3215 PetscBool both = PETSC_FALSE; 3216 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3217 3218 PetscFunctionBegin; 3219 if (A->factortype == MAT_FACTOR_NONE) { 3220 Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr; 3221 if (spptr->mat) { 3222 CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat; 3223 if (matrix->values) { 3224 both = PETSC_TRUE; 3225 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3226 } 3227 } 3228 if (spptr->matTranspose) { 3229 CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat; 3230 if (matrix->values) { 3231 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3232 } 3233 } 3234 } 3235 //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 3236 ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr); 3237 ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 3238 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 3239 else A->offloadmask = PETSC_OFFLOAD_CPU; 3240 3241 PetscFunctionReturn(0); 3242 } 3243 3244 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg) 3245 { 3246 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3247 PetscErrorCode ierr; 3248 3249 PetscFunctionBegin; 3250 if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0); 3251 if (flg) { 3252 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 3253 3254 A->ops->scale = MatScale_SeqAIJ; 3255 A->ops->axpy = MatAXPY_SeqAIJ; 3256 A->ops->zeroentries = MatZeroEntries_SeqAIJ; 3257 A->ops->mult = MatMult_SeqAIJ; 3258 A->ops->multadd = MatMultAdd_SeqAIJ; 3259 A->ops->multtranspose = MatMultTranspose_SeqAIJ; 3260 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 3261 A->ops->multhermitiantranspose = NULL; 3262 A->ops->multhermitiantransposeadd = NULL; 3263 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ; 3264 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr); 3265 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 3266 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 3267 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 3268 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 3269 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr); 3270 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr); 3271 } else { 3272 A->ops->scale = MatScale_SeqAIJCUSPARSE; 3273 A->ops->axpy = MatAXPY_SeqAIJCUSPARSE; 3274 A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE; 3275 A->ops->mult = MatMult_SeqAIJCUSPARSE; 3276 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 3277 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 3278 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 3279 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 3280 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 3281 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE; 3282 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr); 3283 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3284 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3285 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3286 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3287 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr); 3288 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3289 } 3290 A->boundtocpu = flg; 3291 a->inode.use = flg; 3292 PetscFunctionReturn(0); 3293 } 3294 3295 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 3296 { 3297 PetscErrorCode ierr; 3298 cusparseStatus_t stat; 3299 Mat B; 3300 3301 PetscFunctionBegin; 3302 ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */ 3303 if (reuse == MAT_INITIAL_MATRIX) { 3304 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 3305 } else if (reuse == MAT_REUSE_MATRIX) { 3306 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 3307 } 3308 B = *newmat; 3309 3310 ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 3311 ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr); 3312 3313 if (reuse != MAT_REUSE_MATRIX && !B->spptr) { 3314 if (B->factortype == MAT_FACTOR_NONE) { 3315 Mat_SeqAIJCUSPARSE *spptr; 3316 3317 ierr = PetscNew(&spptr);CHKERRQ(ierr); 3318 spptr->format = MAT_CUSPARSE_CSR; 3319 stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3320 stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat); 3321 B->spptr = spptr; 3322 spptr->deviceMat = NULL; 3323 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3324 spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 3325 spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 3326 spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 3327 #endif 3328 } else { 3329 Mat_SeqAIJCUSPARSETriFactors *spptr; 3330 3331 ierr = PetscNew(&spptr);CHKERRQ(ierr); 3332 stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3333 stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat); 3334 B->spptr = spptr; 3335 } 3336 B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 3337 } 3338 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 3339 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 3340 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 3341 B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 3342 B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 3343 3344 ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr); 3345 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3346 ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 3347 PetscFunctionReturn(0); 3348 } 3349 3350 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 3351 { 3352 PetscErrorCode ierr; 3353 3354 PetscFunctionBegin; 3355 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 3356 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 3357 ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr); 3358 ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr); 3359 ierr = PetscOptionsEnd();CHKERRQ(ierr); 3360 PetscFunctionReturn(0); 3361 } 3362 3363 /*MC 3364 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 3365 3366 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 3367 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 3368 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 3369 3370 Options Database Keys: 3371 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 3372 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3373 - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3374 3375 Level: beginner 3376 3377 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 3378 M*/ 3379 3380 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*); 3381 3382 3383 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 3384 { 3385 PetscErrorCode ierr; 3386 3387 PetscFunctionBegin; 3388 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3389 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3390 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3391 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3392 PetscFunctionReturn(0); 3393 } 3394 3395 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 3396 { 3397 PetscErrorCode ierr; 3398 cusparseStatus_t stat; 3399 3400 PetscFunctionBegin; 3401 if (*cusparsestruct) { 3402 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr); 3403 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr); 3404 delete (*cusparsestruct)->workVector; 3405 delete (*cusparsestruct)->rowoffsets_gpu; 3406 delete (*cusparsestruct)->cooPerm; 3407 delete (*cusparsestruct)->cooPerm_a; 3408 delete (*cusparsestruct)->csr2csc_i; 3409 if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);} 3410 ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr); 3411 } 3412 PetscFunctionReturn(0); 3413 } 3414 3415 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 3416 { 3417 PetscFunctionBegin; 3418 if (*mat) { 3419 delete (*mat)->values; 3420 delete (*mat)->column_indices; 3421 delete (*mat)->row_offsets; 3422 delete *mat; 3423 *mat = 0; 3424 } 3425 PetscFunctionReturn(0); 3426 } 3427 3428 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 3429 { 3430 cusparseStatus_t stat; 3431 PetscErrorCode ierr; 3432 3433 PetscFunctionBegin; 3434 if (*trifactor) { 3435 if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); } 3436 if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); } 3437 ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 3438 if ((*trifactor)->solveBuffer) {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);} 3439 if ((*trifactor)->AA_h) {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);} 3440 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3441 if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);} 3442 #endif 3443 ierr = PetscFree(*trifactor);CHKERRQ(ierr); 3444 } 3445 PetscFunctionReturn(0); 3446 } 3447 3448 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 3449 { 3450 CsrMatrix *mat; 3451 cusparseStatus_t stat; 3452 cudaError_t err; 3453 3454 PetscFunctionBegin; 3455 if (*matstruct) { 3456 if ((*matstruct)->mat) { 3457 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 3458 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3459 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3460 #else 3461 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 3462 stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat); 3463 #endif 3464 } else { 3465 mat = (CsrMatrix*)(*matstruct)->mat; 3466 CsrMatrix_Destroy(&mat); 3467 } 3468 } 3469 if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); } 3470 delete (*matstruct)->cprowIndices; 3471 if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); } 3472 if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); } 3473 if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); } 3474 3475 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3476 Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 3477 if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);} 3478 for (int i=0; i<3; i++) { 3479 if (mdata->cuSpMV[i].initialized) { 3480 err = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err); 3481 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat); 3482 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat); 3483 } 3484 } 3485 #endif 3486 delete *matstruct; 3487 *matstruct = NULL; 3488 } 3489 PetscFunctionReturn(0); 3490 } 3491 3492 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors) 3493 { 3494 PetscErrorCode ierr; 3495 3496 PetscFunctionBegin; 3497 if (*trifactors) { 3498 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr); 3499 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr); 3500 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr); 3501 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr); 3502 delete (*trifactors)->rpermIndices; 3503 delete (*trifactors)->cpermIndices; 3504 delete (*trifactors)->workVector; 3505 (*trifactors)->rpermIndices = NULL; 3506 (*trifactors)->cpermIndices = NULL; 3507 (*trifactors)->workVector = NULL; 3508 } 3509 PetscFunctionReturn(0); 3510 } 3511 3512 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 3513 { 3514 PetscErrorCode ierr; 3515 cusparseHandle_t handle; 3516 cusparseStatus_t stat; 3517 3518 PetscFunctionBegin; 3519 if (*trifactors) { 3520 ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr); 3521 if (handle = (*trifactors)->handle) { 3522 stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 3523 } 3524 ierr = PetscFree(*trifactors);CHKERRQ(ierr); 3525 } 3526 PetscFunctionReturn(0); 3527 } 3528 3529 struct IJCompare 3530 { 3531 __host__ __device__ 3532 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3533 { 3534 if (t1.get<0>() < t2.get<0>()) return true; 3535 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 3536 return false; 3537 } 3538 }; 3539 3540 struct IJEqual 3541 { 3542 __host__ __device__ 3543 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3544 { 3545 if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false; 3546 return true; 3547 } 3548 }; 3549 3550 struct IJDiff 3551 { 3552 __host__ __device__ 3553 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 3554 { 3555 return t1 == t2 ? 0 : 1; 3556 } 3557 }; 3558 3559 struct IJSum 3560 { 3561 __host__ __device__ 3562 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 3563 { 3564 return t1||t2; 3565 } 3566 }; 3567 3568 #include <thrust/iterator/discard_iterator.h> 3569 PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 3570 { 3571 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3572 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3573 THRUSTARRAY *cooPerm_v = NULL; 3574 thrust::device_ptr<const PetscScalar> d_v; 3575 CsrMatrix *matrix; 3576 PetscErrorCode ierr; 3577 cudaError_t cerr; 3578 PetscInt n; 3579 3580 PetscFunctionBegin; 3581 if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct"); 3582 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix"); 3583 if (!cusp->cooPerm) { 3584 ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3585 ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3586 PetscFunctionReturn(0); 3587 } 3588 matrix = (CsrMatrix*)cusp->mat->mat; 3589 if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3590 if (!v) { 3591 if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3592 goto finalize; 3593 } 3594 n = cusp->cooPerm->size(); 3595 if (isCudaMem(v)) { 3596 d_v = thrust::device_pointer_cast(v); 3597 } else { 3598 cooPerm_v = new THRUSTARRAY(n); 3599 cooPerm_v->assign(v,v+n); 3600 d_v = cooPerm_v->data(); 3601 ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); 3602 } 3603 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3604 if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */ 3605 if (cusp->cooPerm_a) { 3606 THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size()); 3607 auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3608 thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3609 thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>()); 3610 delete cooPerm_w; 3611 } else { 3612 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 3613 matrix->values->begin())); 3614 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 3615 matrix->values->end())); 3616 thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); 3617 } 3618 } else { 3619 if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */ 3620 auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3621 thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3622 } else { 3623 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 3624 matrix->values->begin())); 3625 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 3626 matrix->values->end())); 3627 thrust::for_each(zibit,zieit,VecCUDAEquals()); 3628 } 3629 } 3630 cerr = WaitForCUDA();CHKERRCUDA(cerr); 3631 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3632 finalize: 3633 delete cooPerm_v; 3634 A->offloadmask = PETSC_OFFLOAD_GPU; 3635 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 3636 /* shorter version of MatAssemblyEnd_SeqAIJ */ 3637 ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr); 3638 ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 3639 ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr); 3640 a->reallocs = 0; 3641 A->info.mallocs += 0; 3642 A->info.nz_unneeded = 0; 3643 A->assembled = A->was_assembled = PETSC_TRUE; 3644 A->num_ass++; 3645 PetscFunctionReturn(0); 3646 } 3647 3648 PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy) 3649 { 3650 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3651 PetscErrorCode ierr; 3652 3653 PetscFunctionBegin; 3654 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3655 if (!cusp) PetscFunctionReturn(0); 3656 if (destroy) { 3657 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr); 3658 delete cusp->csr2csc_i; 3659 cusp->csr2csc_i = NULL; 3660 } 3661 cusp->transupdated = PETSC_FALSE; 3662 PetscFunctionReturn(0); 3663 } 3664 3665 #include <thrust/binary_search.h> 3666 PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[]) 3667 { 3668 PetscErrorCode ierr; 3669 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3670 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3671 PetscInt cooPerm_n, nzr = 0; 3672 cudaError_t cerr; 3673 3674 PetscFunctionBegin; 3675 ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr); 3676 ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr); 3677 cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0; 3678 if (n != cooPerm_n) { 3679 delete cusp->cooPerm; 3680 delete cusp->cooPerm_a; 3681 cusp->cooPerm = NULL; 3682 cusp->cooPerm_a = NULL; 3683 } 3684 if (n) { 3685 THRUSTINTARRAY d_i(n); 3686 THRUSTINTARRAY d_j(n); 3687 THRUSTINTARRAY ii(A->rmap->n); 3688 3689 if (!cusp->cooPerm) { cusp->cooPerm = new THRUSTINTARRAY(n); } 3690 if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); } 3691 3692 ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 3693 d_i.assign(coo_i,coo_i+n); 3694 d_j.assign(coo_j,coo_j+n); 3695 auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin())); 3696 auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end())); 3697 3698 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3699 thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0); 3700 thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); 3701 *cusp->cooPerm_a = d_i; 3702 THRUSTINTARRAY w = d_j; 3703 3704 auto nekey = thrust::unique(fkey, ekey, IJEqual()); 3705 if (nekey == ekey) { /* all entries are unique */ 3706 delete cusp->cooPerm_a; 3707 cusp->cooPerm_a = NULL; 3708 } else { /* I couldn't come up with a more elegant algorithm */ 3709 adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); 3710 adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff()); 3711 (*cusp->cooPerm_a)[0] = 0; 3712 w[0] = 0; 3713 thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); 3714 thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); 3715 } 3716 thrust::counting_iterator<PetscInt> search_begin(0); 3717 thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), 3718 search_begin, search_begin + A->rmap->n, 3719 ii.begin()); 3720 cerr = WaitForCUDA();CHKERRCUDA(cerr); 3721 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3722 3723 ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr); 3724 a->singlemalloc = PETSC_FALSE; 3725 a->free_a = PETSC_TRUE; 3726 a->free_ij = PETSC_TRUE; 3727 ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr); 3728 a->i[0] = 0; 3729 cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 3730 a->nz = a->maxnz = a->i[A->rmap->n]; 3731 a->rmax = 0; 3732 ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr); 3733 ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr); 3734 cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 3735 if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); } 3736 if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); } 3737 for (PetscInt i = 0; i < A->rmap->n; i++) { 3738 const PetscInt nnzr = a->i[i+1] - a->i[i]; 3739 nzr += (PetscInt)!!(nnzr); 3740 a->ilen[i] = a->imax[i] = nnzr; 3741 a->rmax = PetscMax(a->rmax,nnzr); 3742 } 3743 a->nonzerorowcnt = nzr; 3744 A->preallocated = PETSC_TRUE; 3745 ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr); 3746 ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr); 3747 } else { 3748 ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr); 3749 } 3750 ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr); 3751 3752 /* We want to allocate the CUSPARSE struct for matvec now. 3753 The code is so convoluted now that I prefer to copy zeros */ 3754 ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr); 3755 ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr); 3756 A->offloadmask = PETSC_OFFLOAD_CPU; 3757 A->nonzerostate++; 3758 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 3759 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 3760 3761 A->assembled = PETSC_FALSE; 3762 A->was_assembled = PETSC_FALSE; 3763 PetscFunctionReturn(0); 3764 } 3765 3766 PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a) 3767 { 3768 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3769 CsrMatrix *csr; 3770 PetscErrorCode ierr; 3771 3772 PetscFunctionBegin; 3773 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3774 PetscValidPointer(a,2); 3775 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3776 if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3777 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 3778 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3779 csr = (CsrMatrix*)cusp->mat->mat; 3780 if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3781 *a = csr->values->data().get(); 3782 PetscFunctionReturn(0); 3783 } 3784 3785 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a) 3786 { 3787 PetscFunctionBegin; 3788 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3789 PetscValidPointer(a,2); 3790 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3791 *a = NULL; 3792 PetscFunctionReturn(0); 3793 } 3794 3795 PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a) 3796 { 3797 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3798 CsrMatrix *csr; 3799 PetscErrorCode ierr; 3800 3801 PetscFunctionBegin; 3802 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3803 PetscValidPointer(a,2); 3804 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3805 if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3806 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 3807 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3808 csr = (CsrMatrix*)cusp->mat->mat; 3809 if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3810 *a = csr->values->data().get(); 3811 A->offloadmask = PETSC_OFFLOAD_GPU; 3812 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); 3813 PetscFunctionReturn(0); 3814 } 3815 3816 PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a) 3817 { 3818 PetscErrorCode ierr; 3819 3820 PetscFunctionBegin; 3821 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3822 PetscValidPointer(a,2); 3823 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3824 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 3825 *a = NULL; 3826 PetscFunctionReturn(0); 3827 } 3828 3829 PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a) 3830 { 3831 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3832 CsrMatrix *csr; 3833 PetscErrorCode ierr; 3834 3835 PetscFunctionBegin; 3836 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3837 PetscValidPointer(a,2); 3838 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3839 if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3840 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3841 csr = (CsrMatrix*)cusp->mat->mat; 3842 if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3843 *a = csr->values->data().get(); 3844 A->offloadmask = PETSC_OFFLOAD_GPU; 3845 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); 3846 PetscFunctionReturn(0); 3847 } 3848 3849 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a) 3850 { 3851 PetscErrorCode ierr; 3852 3853 PetscFunctionBegin; 3854 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3855 PetscValidPointer(a,2); 3856 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3857 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 3858 *a = NULL; 3859 PetscFunctionReturn(0); 3860 } 3861 3862 struct IJCompare4 3863 { 3864 __host__ __device__ 3865 inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) 3866 { 3867 if (t1.get<0>() < t2.get<0>()) return true; 3868 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 3869 return false; 3870 } 3871 }; 3872 3873 struct Shift 3874 { 3875 int _shift; 3876 3877 Shift(int shift) : _shift(shift) {} 3878 __host__ __device__ 3879 inline int operator() (const int &c) 3880 { 3881 return c + _shift; 3882 } 3883 }; 3884 3885 /* merges to SeqAIJCUSPARSE matrices, [A';B']' operation in matlab notation */ 3886 PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 3887 { 3888 PetscErrorCode ierr; 3889 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c; 3890 Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp; 3891 Mat_SeqAIJCUSPARSEMultStruct *Cmat; 3892 CsrMatrix *Acsr,*Bcsr,*Ccsr; 3893 PetscInt Annz,Bnnz; 3894 cusparseStatus_t stat; 3895 PetscInt i,m,n,zero = 0; 3896 cudaError_t cerr; 3897 3898 PetscFunctionBegin; 3899 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3900 PetscValidHeaderSpecific(B,MAT_CLASSID,2); 3901 PetscValidPointer(C,4); 3902 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3903 PetscCheckTypeName(B,MATSEQAIJCUSPARSE); 3904 if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n); 3905 if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 3906 if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3907 if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3908 if (reuse == MAT_INITIAL_MATRIX) { 3909 m = A->rmap->n; 3910 n = A->cmap->n + B->cmap->n; 3911 ierr = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr); 3912 ierr = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr); 3913 ierr = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3914 c = (Mat_SeqAIJ*)(*C)->data; 3915 Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 3916 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 3917 Ccsr = new CsrMatrix; 3918 Cmat->cprowIndices = NULL; 3919 c->compressedrow.use = PETSC_FALSE; 3920 c->compressedrow.nrows = 0; 3921 c->compressedrow.i = NULL; 3922 c->compressedrow.rindex = NULL; 3923 Ccusp->workVector = NULL; 3924 Ccusp->nrows = m; 3925 Ccusp->mat = Cmat; 3926 Ccusp->mat->mat = Ccsr; 3927 Ccsr->num_rows = m; 3928 Ccsr->num_cols = n; 3929 stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat); 3930 stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 3931 stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 3932 cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 3933 cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 3934 cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 3935 cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 3936 cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 3937 cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 3938 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 3939 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 3940 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 3941 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr); 3942 if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3943 if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3944 3945 Acsr = (CsrMatrix*)Acusp->mat->mat; 3946 Bcsr = (CsrMatrix*)Bcusp->mat->mat; 3947 Annz = (PetscInt)Acsr->column_indices->size(); 3948 Bnnz = (PetscInt)Bcsr->column_indices->size(); 3949 c->nz = Annz + Bnnz; 3950 Ccsr->row_offsets = new THRUSTINTARRAY32(m+1); 3951 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3952 Ccsr->values = new THRUSTARRAY(c->nz); 3953 Ccsr->num_entries = c->nz; 3954 Ccusp->cooPerm = new THRUSTINTARRAY(c->nz); 3955 if (c->nz) { 3956 auto Acoo = new THRUSTINTARRAY32(Annz); 3957 auto Bcoo = new THRUSTINTARRAY32(Bnnz); 3958 auto Ccoo = new THRUSTINTARRAY32(c->nz); 3959 THRUSTINTARRAY32 *Aroff,*Broff; 3960 3961 if (a->compressedrow.use) { /* need full row offset */ 3962 if (!Acusp->rowoffsets_gpu) { 3963 Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 3964 Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 3965 ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 3966 } 3967 Aroff = Acusp->rowoffsets_gpu; 3968 } else Aroff = Acsr->row_offsets; 3969 if (b->compressedrow.use) { /* need full row offset */ 3970 if (!Bcusp->rowoffsets_gpu) { 3971 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 3972 Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 3973 ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 3974 } 3975 Broff = Bcusp->rowoffsets_gpu; 3976 } else Broff = Bcsr->row_offsets; 3977 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3978 stat = cusparseXcsr2coo(Acusp->handle, 3979 Aroff->data().get(), 3980 Annz, 3981 m, 3982 Acoo->data().get(), 3983 CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 3984 stat = cusparseXcsr2coo(Bcusp->handle, 3985 Broff->data().get(), 3986 Bnnz, 3987 m, 3988 Bcoo->data().get(), 3989 CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 3990 /* Issues when using bool with large matrices on SUMMIT 10.2.89 */ 3991 auto Aperm = thrust::make_constant_iterator(1); 3992 auto Bperm = thrust::make_constant_iterator(0); 3993 #if PETSC_PKG_CUDA_VERSION_GE(10,0,0) 3994 auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n)); 3995 auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n)); 3996 #else 3997 /* there are issues instantiating the merge operation using a transform iterator for the columns of B */ 3998 auto Bcib = Bcsr->column_indices->begin(); 3999 auto Bcie = Bcsr->column_indices->end(); 4000 thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n)); 4001 #endif 4002 auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz); 4003 auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm)); 4004 auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm)); 4005 auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm)); 4006 auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm)); 4007 auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin())); 4008 auto p1 = Ccusp->cooPerm->begin(); 4009 auto p2 = Ccusp->cooPerm->begin(); 4010 thrust::advance(p2,Annz); 4011 PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4())); 4012 #if PETSC_PKG_CUDA_VERSION_LT(10,0,0) 4013 thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n)); 4014 #endif 4015 auto cci = thrust::make_counting_iterator(zero); 4016 auto cce = thrust::make_counting_iterator(c->nz); 4017 #if 0 //Errors on SUMMIT cuda 11.1.0 4018 PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>())); 4019 #else 4020 auto pred = thrust::identity<int>(); 4021 PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred)); 4022 PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred)); 4023 #endif 4024 stat = cusparseXcoo2csr(Ccusp->handle, 4025 Ccoo->data().get(), 4026 c->nz, 4027 m, 4028 Ccsr->row_offsets->data().get(), 4029 CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4030 cerr = WaitForCUDA();CHKERRCUDA(cerr); 4031 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4032 delete wPerm; 4033 delete Acoo; 4034 delete Bcoo; 4035 delete Ccoo; 4036 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4037 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, 4038 Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), 4039 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 4040 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 4041 #endif 4042 if (Acusp->transgen && Bcusp->transgen) { /* if A and B have the transpose, generate C transpose too */ 4043 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4044 Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct; 4045 CsrMatrix *CcsrT = new CsrMatrix; 4046 CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4047 CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4048 4049 Ccusp->transgen = PETSC_TRUE; 4050 Ccusp->transupdated = PETSC_TRUE; 4051 Ccusp->rowoffsets_gpu = NULL; 4052 CmatT->cprowIndices = NULL; 4053 CmatT->mat = CcsrT; 4054 CcsrT->num_rows = n; 4055 CcsrT->num_cols = m; 4056 CcsrT->num_entries = c->nz; 4057 4058 CcsrT->row_offsets = new THRUSTINTARRAY32(n+1); 4059 CcsrT->column_indices = new THRUSTINTARRAY32(c->nz); 4060 CcsrT->values = new THRUSTARRAY(c->nz); 4061 4062 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4063 auto rT = CcsrT->row_offsets->begin(); 4064 if (AT) { 4065 rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT); 4066 thrust::advance(rT,-1); 4067 } 4068 if (BT) { 4069 auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz)); 4070 auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz)); 4071 thrust::copy(titb,tite,rT); 4072 } 4073 auto cT = CcsrT->column_indices->begin(); 4074 if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT); 4075 if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT); 4076 auto vT = CcsrT->values->begin(); 4077 if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4078 if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 4079 cerr = WaitForCUDA();CHKERRCUDA(cerr); 4080 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4081 4082 stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat); 4083 stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4084 stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 4085 cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 4086 cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 4087 cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 4088 cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4089 cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4090 cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4091 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4092 stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, 4093 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), 4094 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 4095 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 4096 #endif 4097 Ccusp->matTranspose = CmatT; 4098 } 4099 } 4100 4101 c->singlemalloc = PETSC_FALSE; 4102 c->free_a = PETSC_TRUE; 4103 c->free_ij = PETSC_TRUE; 4104 ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 4105 ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 4106 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 4107 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 4108 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 4109 ii = *Ccsr->row_offsets; 4110 jj = *Ccsr->column_indices; 4111 cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4112 cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4113 } else { 4114 cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4115 cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4116 } 4117 ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 4118 ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 4119 ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 4120 c->maxnz = c->nz; 4121 c->nonzerorowcnt = 0; 4122 c->rmax = 0; 4123 for (i = 0; i < m; i++) { 4124 const PetscInt nn = c->i[i+1] - c->i[i]; 4125 c->ilen[i] = c->imax[i] = nn; 4126 c->nonzerorowcnt += (PetscInt)!!nn; 4127 c->rmax = PetscMax(c->rmax,nn); 4128 } 4129 ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr); 4130 ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 4131 (*C)->nonzerostate++; 4132 ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr); 4133 ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr); 4134 Ccusp->nonzerostate = (*C)->nonzerostate; 4135 (*C)->preallocated = PETSC_TRUE; 4136 } else { 4137 if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n); 4138 c = (Mat_SeqAIJ*)(*C)->data; 4139 if (c->nz) { 4140 Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 4141 if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm"); 4142 if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4143 if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate"); 4144 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4145 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 4146 if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4147 if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4148 Acsr = (CsrMatrix*)Acusp->mat->mat; 4149 Bcsr = (CsrMatrix*)Bcusp->mat->mat; 4150 Ccsr = (CsrMatrix*)Ccusp->mat->mat; 4151 if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size()); 4152 if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size()); 4153 if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size()); 4154 if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries); 4155 if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size()); 4156 auto pmid = Ccusp->cooPerm->begin(); 4157 thrust::advance(pmid,Acsr->num_entries); 4158 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4159 auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), 4160 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin()))); 4161 auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), 4162 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4163 thrust::for_each(zibait,zieait,VecCUDAEquals()); 4164 auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), 4165 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4166 auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), 4167 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end()))); 4168 thrust::for_each(zibbit,ziebit,VecCUDAEquals()); 4169 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr); 4170 if (Acusp->transgen && Bcusp->transgen && Ccusp->transgen) { 4171 if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct"); 4172 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4173 CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4174 CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4175 CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat; 4176 auto vT = CcsrT->values->begin(); 4177 if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4178 if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 4179 Ccusp->transupdated = PETSC_TRUE; 4180 } 4181 cerr = WaitForCUDA();CHKERRCUDA(cerr); 4182 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4183 } 4184 } 4185 ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr); 4186 (*C)->assembled = PETSC_TRUE; 4187 (*C)->was_assembled = PETSC_FALSE; 4188 (*C)->offloadmask = PETSC_OFFLOAD_GPU; 4189 PetscFunctionReturn(0); 4190 } 4191 4192 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[]) 4193 { 4194 PetscErrorCode ierr; 4195 bool dmem; 4196 const PetscScalar *av; 4197 cudaError_t cerr; 4198 4199 PetscFunctionBegin; 4200 dmem = isCudaMem(v); 4201 ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr); 4202 if (n && idx) { 4203 THRUSTINTARRAY widx(n); 4204 widx.assign(idx,idx+n); 4205 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 4206 4207 THRUSTARRAY *w = NULL; 4208 thrust::device_ptr<PetscScalar> dv; 4209 if (dmem) { 4210 dv = thrust::device_pointer_cast(v); 4211 } else { 4212 w = new THRUSTARRAY(n); 4213 dv = w->data(); 4214 } 4215 thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av); 4216 4217 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv)); 4218 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n)); 4219 thrust::for_each(zibit,zieit,VecCUDAEquals()); 4220 if (w) { 4221 cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4222 } 4223 delete w; 4224 } else { 4225 cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4226 } 4227 if (!dmem) { ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); } 4228 ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr); 4229 PetscFunctionReturn(0); 4230 } 4231