/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <thrust/async/for_each.h>

/* String table in the layout PetscOptionsEnum() expects: the enum value names in
   0-based value order, then the enum type name, the common prefix to strip, and a
   terminating 0 sentinel. */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)        = 1,
      CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)        = 2,
      CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)        = 3,
      CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)        = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
  } cusparseCsr2CscAlg_t;
*/
/* Note the SpMM table lists CSR_ALG1 (value 4) before COO_ALG4 (value 5) so that array
   position matches the cuSPARSE integer value, as PetscOptionsEnum() requires. */
const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif

/* Forward declarations of the SeqAIJCUSPARSE implementations installed in the Mat ops table */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode
MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

/* Destroy/reset helpers for the GPU-side data structures */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);

PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);

/* Stores `stream` in A's Mat_SeqAIJCUSPARSE struct and makes the cuSPARSE handle use it.
   A must already carry its GPU struct in A->spptr (i.e. be of type seqaijcusparse);
   errors with PETSC_ERR_COR otherwise. */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

/* Installs a caller-supplied cuSPARSE handle on A, destroying any different handle the
   struct previously held, and switches the (new) handle to device pointer mode.
   NOTE(review): after this call the struct stores the caller's handle; confirm the
   intended ownership convention with MatCUSPARSEClearHandle() below, which drops the
   handle without destroying it. */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      /* the previously held handle is owned by this struct: release it before replacing */
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

/* Forgets (does NOT cusparseDestroy) the handle stored in A's GPU struct; silently a
   no-op when A is not seqaijcusparse or has no spptr. Used when the handle is owned
   elsewhere and must not be freed with the matrix. */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          flg;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg || !cusparsestruct) PetscFunctionReturn(0);
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

/* Reports MATSOLVERCUSPARSE as the solver package of factor matrices created by
   MatGetFactor_seqaijcusparse_cusparse(); composed as "MatFactorGetSolverType_C". */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

   Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/* Factory for MATSOLVERCUSPARSE factor matrices: creates an n x n seqaijcusparse matrix B,
   marks its factor type, and installs the symbolic-factorization entry points for the
   requested ftype (LU/ILU/ILUDT or Cholesky/ICC); other factor types are rejected.
   The numeric kernels are installed later by the symbolic routines. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  (*B)->useordering = PETSC_TRUE;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* MAT_SKIP_ALLOCATION: the CPU symbolic factorization fills in the structure later */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type-specific implementation behind MatCUSPARSESetFormat(); for seqaijcusparse both
   MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL simply set the single stored format. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.
   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* PetscTryMethod: silently a no-op for Mat types that do not compose the method */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
   MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose

   Collective on mat

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
-  transgen - the boolean flag

   Level: intermediate

.seealso: MATSEQAIJCUSPARSE, MatAIJCUSPARSESetGenerateTranspose()
@*/
PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
{
  PetscErrorCode ierr;
  PetscBool      flg;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  ierr =
PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (flg) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
    cusp->transgen = transgen;
    if (!transgen) { /* need to destroy the transpose matrix if present to prevent from logic errors if transgen is set to true later */
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}

/* Processes the -mat_cusparse_* runtime options: explicit-transpose generation, storage
   format(s), and (CUDA >= 11) the cuSPARSE SpMV/SpMM/csr2csc algorithm choices.
   Options are only consulted for non-factored matrices. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscBool transgen = cusparsestruct->transgen;

    ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Symbolic ILU: resets any stale GPU triangular factors, delegates the symbolic phase to
   the CPU SeqAIJ implementation, then installs the CUSPARSE numeric kernel. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic LU: same pattern as ILU above — CPU symbolic phase, CUSPARSE numeric kernel. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic ICC: CPU symbolic phase, CUSPARSE Cholesky numeric kernel. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic Cholesky: CPU symbolic phase, CUSPARSE Cholesky numeric kernel. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr =
MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Extracts the unit-diagonal lower triangular factor L from the CPU ILU factor stored in
   A (SeqAIJ layout, L and U packed together), uploads it to the GPU as a CSR matrix in
   pinned host staging buffers, and runs the cuSPARSE triangular-solve analysis on it.
   On re-factorization (loTriFactor already present) only the numerical values are
   refreshed; the sparsity pattern, descriptor, and analysis info are reused. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;

        /* pinned host buffers for fast/async transfer to the device */
        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix: row 0 of L is just the implicit unit diagonal;
           each later row i copies its strictly-lower entries from A and appends a 1.0 at
           column i for the unit diagonal */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* csrsv2-style API needs an explicit work buffer sized by the library */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                                 );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        /* AALo is kept (as AA_h) for value-only updates on re-factorization; the index
           staging buffers are no longer needed */
        loTriFactor->AA_h = AALo;
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v      = aa;
        vi     = aj;
        offset = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v  += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Extracts the upper triangular factor U (non-unit diagonal, stored inverted so the
   solve can multiply instead of divide) from the CPU ILU factor in A, uploads it to the
   GPU, and runs the cuSPARSE triangular-solve analysis; mirrors the lower-factor builder
   above, but walks rows backwards using a->diag to locate each row's U segment. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next,
figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        /* pinned host staging buffers for the CSR arrays of U */
        cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix.
           In the SeqAIJ factor, a->diag stores U row-by-row in reverse, so walk rows
           from n-1 down to 0; each row contributes its (inverted) diagonal entry first,
           followed by its strictly-upper entries. */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;
          vi = aj + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements: the stored value is 1/diag so the
             triangular solve needs no division */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1./v[nz];
          AiUp[i]      = AiUp[i+1] - (nz+1);

          ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        /* NON_UNIT: U's diagonal entries (stored inverted) participate in the solve */
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = n;
        upTriFactor->csrMat->num_cols = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                                 );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
        /* keep the values staging buffer for value-only refreshes; drop the index buffers */
        upTriFactor->AA_h = AAUp;
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      } else {
        /* re-factorization with unchanged sparsity: refresh numerical values only */
        if (!upTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v = aa + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1./v[nz];
          ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
        ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Builds both ILU triangular factors on the GPU (including solve analysis), allocates the
   shared work vector, and caches the row/column permutation indices on the device when the
   factorization orderings are not the identity. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  if
(!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cusparseTriFactors->nnz=a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices: device copy of the row permutation, only when non-identity */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
    ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* upper triangular indices: device copy of the column permutation, only when non-identity */
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
    ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Builds BOTH triangular factors for ICC/Cholesky from the single stored upper factor:
   U (with inverted diagonal) and the matching scaled lower factor are assembled in one
   pass over the rows.
   NOTE(review): A->data is cast both as Mat_SeqAIJ (for a->nz) and as Mat_SeqSBAIJ (for
   the i/j/a arrays) — this relies on the factor's storage layout; confirm against the
   ICC numeric factorization that fills it. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n)
PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the upper (AAUp) and lower (AALo) factor values */
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) { /* first time: build structure + values */
        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements (stored inverted for both factors) */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            /* off-diagonals are negated; the lower factor additionally divides by the diagonal */
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = A->rmap->n;
        upTriFactor->csrMat->num_cols    = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* csrsv2 interface requires an explicit user-provided work buffer */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                                 );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        /* the lower factor reuses the UPPER storage and is applied via a TRANSPOSE solve */
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = A->rmap->n;
        loTriFactor->csrMat->num_cols    = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                                 );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else { /* structure already on the GPU: only refresh the values */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/*
   Pushes the ICC(0) triangular factors of A to the GPU and caches the
   (symmetric) permutation and its inverse used by MatSolve.

   nnz is recorded as 2*(a->nz - n) + n: the strictly upper triangle counted
   for both factors plus the shared diagonal.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip,*rip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    /* for the symmetric (ICC) case the column permutation is the inverse of the row permutation */
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
   MatLUFactorNumeric for SEQAIJCUSPARSE: performs the numeric LU on the CPU
   (via MatLUFactorNumeric_SeqAIJ), then selects the MatSolve variants based
   on whether the row/column orderings are identities, and pushes the
   triangular factors to the GPU.

   Input Parameters:
+  B    - the factor matrix
.  A    - the matrix being factored (copied from the GPU first if needed)
-  info - factorization options

   The "NaturalOrdering" solve variants skip the permutation gather/scatter.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatCholeskyFactorNumeric for SEQAIJCUSPARSE: numeric Cholesky on the CPU,
   then selects the MatSolve variants based on the (symmetric) ordering and
   pushes the ICC triangular factors to the GPU.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Builds explicit transposes (CSC form via csr2csc) of the lower and upper
   triangular factors and runs the cuSPARSE solve analysis on them, so that
   MatSolveTranspose can use NON_TRANSPOSE solves on the transposed data.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase =
cusparseGetMatIndexBase(loTriFactor->descr); 1023 fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1024 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1025 diagType = cusparseGetMatDiagType(loTriFactor->descr); 1026 1027 /* Create the matrix description */ 1028 stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat); 1029 stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 1030 stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 1031 stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 1032 stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1033 1034 /* set the operation */ 1035 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1036 1037 /* allocate GPU space for the CSC of the lower triangular factor*/ 1038 loTriFactorT->csrMat = new CsrMatrix; 1039 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 1040 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 1041 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 1042 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1); 1043 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 1044 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 1045 1046 /* compute the transpose of the lower triangular factor, i.e. 
the CSC */ 1047 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1048 stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1049 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1050 loTriFactor->csrMat->values->data().get(), 1051 loTriFactor->csrMat->row_offsets->data().get(), 1052 loTriFactor->csrMat->column_indices->data().get(), 1053 loTriFactorT->csrMat->values->data().get(), 1054 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1055 CUSPARSE_ACTION_NUMERIC,indexBase, 1056 CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1057 cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1058 #endif 1059 1060 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1061 stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1062 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1063 loTriFactor->csrMat->values->data().get(), 1064 loTriFactor->csrMat->row_offsets->data().get(), 1065 loTriFactor->csrMat->column_indices->data().get(), 1066 loTriFactorT->csrMat->values->data().get(), 1067 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1068 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1069 CUSPARSE_ACTION_NUMERIC, indexBase, 1070 CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer 1071 #else 1072 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1073 CUSPARSE_ACTION_NUMERIC, indexBase 1074 #endif 1075 );CHKERRCUSPARSE(stat); 1076 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1077 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1078 1079 /* Create the solve analysis information */ 1080 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1081 stat = 
cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1082 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1083 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, 1084 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1085 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1086 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, 1087 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1088 cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1089 #endif 1090 1091 /* perform the solve analysis */ 1092 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 1093 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1094 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1095 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo 1096 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1097 ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1098 #endif 1099 );CHKERRCUSPARSE(stat); 1100 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1101 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1102 1103 /* assign the pointer */ 1104 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 1105 1106 /*********************************************/ 1107 /* Now the Transpose of the Upper Tri Factor */ 1108 /*********************************************/ 1109 1110 /* allocate space for the transpose of the upper triangular factor */ 1111 ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr); 1112 upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1113 1114 /* set the matrix descriptors of the upper triangular factor */ 1115 matrixType = cusparseGetMatType(upTriFactor->descr); 1116 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 1117 
fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1118 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1119 diagType = cusparseGetMatDiagType(upTriFactor->descr); 1120 1121 /* Create the matrix description */ 1122 stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat); 1123 stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 1124 stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 1125 stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 1126 stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1127 1128 /* set the operation */ 1129 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1130 1131 /* allocate GPU space for the CSC of the upper triangular factor*/ 1132 upTriFactorT->csrMat = new CsrMatrix; 1133 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1134 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1135 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1136 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1); 1137 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1138 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1139 1140 /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 1141 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1142 stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows, 1143 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1144 upTriFactor->csrMat->values->data().get(), 1145 upTriFactor->csrMat->row_offsets->data().get(), 1146 upTriFactor->csrMat->column_indices->data().get(), 1147 upTriFactorT->csrMat->values->data().get(), 1148 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1149 CUSPARSE_ACTION_NUMERIC,indexBase, 1150 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1151 cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1152 #endif 1153 1154 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1155 stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 1156 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1157 upTriFactor->csrMat->values->data().get(), 1158 upTriFactor->csrMat->row_offsets->data().get(), 1159 upTriFactor->csrMat->column_indices->data().get(), 1160 upTriFactorT->csrMat->values->data().get(), 1161 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1162 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1163 CUSPARSE_ACTION_NUMERIC, indexBase, 1164 CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer 1165 #else 1166 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1167 CUSPARSE_ACTION_NUMERIC, indexBase 1168 #endif 1169 );CHKERRCUSPARSE(stat); 1170 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1171 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1172 1173 /* Create the solve analysis information */ 1174 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1175 stat = 
cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1176 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1177 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, 1178 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1179 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1180 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, 1181 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1182 cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1183 #endif 1184 1185 /* perform the solve analysis */ 1186 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 1187 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1188 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1189 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo 1190 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1191 ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1192 #endif 1193 );CHKERRCUSPARSE(stat); 1194 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1195 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1196 1197 /* assign the pointer */ 1198 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 1199 PetscFunctionReturn(0); 1200 } 1201 1202 struct PetscScalarToPetscInt 1203 { 1204 __host__ __device__ 1205 PetscInt operator()(PetscScalar s) 1206 { 1207 return (PetscInt)PetscRealPart(s); 1208 } 1209 }; 1210 1211 static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A) 1212 { 1213 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1214 Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT; 1215 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1216 cusparseStatus_t stat; 1217 cusparseIndexBase_t indexBase; 1218 cudaError_t err; 1219 
PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* nothing to do when transposes are not requested or the matrix is empty */
  if (!cusparsestruct->transgen || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0);
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (cusparsestruct->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing matTranspose struct");
  if (cusparsestruct->transupdated) PetscFunctionReturn(0); /* transpose already up to date */
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
  }
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

    /* set alpha and beta (device-resident scalars used by SpMV calls) */
    err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      CsrMatrix *matrixT      = new CsrMatrix;
      matstructT->mat         = matrixT;
      matrixT->num_rows       = A->cmap->n; /* transpose: rows/cols swapped */
      matrixT->num_cols       = A->rmap->n;
      matrixT->num_entries    = a->nz;
      matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows+1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values         = new THRUSTARRAY(a->nz);

      if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&matstructT->matDescr,
                               matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                               matrixT->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows       = A->rmap->n;
      temp->num_cols       = A->cmap->n;
      temp->num_entries    = a->nz;
      temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n+1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle,
                              matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get());CHKERRCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      tempT->num_rows       = A->rmap->n;
      tempT->num_cols       = A->cmap->n;
      tempT->num_entries    = a->nz;
      tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n+1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                              temp->num_cols, temp->num_entries,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get(),
                              tempT->values->data().get(),
                              tempT->column_indices->data().get(),
                              tempT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
      cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
        CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                              matstructT->descr, tempT->values->data().get(),
                              tempT->row_offsets->data().get(),
                              tempT->column_indices->data().get(),
                              hybMat, 0, partition);CHKERRCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      cusparsestruct->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY*) tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
        delete (CsrMatrix*) tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY*) temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
        delete (CsrMatrix*) temp;
      }
#endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
    if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix");
    if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix rows");
    if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix cols");
    if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix values");
    if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT");
    if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT rows");
    if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT cols");
    if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
      ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    if (!cusparsestruct->csr2csc_i) {
      /* transpose the sequence 0,1,2,... to recover, in matrixT->values, the
         permutation that maps CSR value positions to CSC value positions */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      void   *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                           A->cmap->n, matrix->num_entries,
                                           matrix->values->data().get(),
                                           cusparsestruct->rowoffsets_gpu->data().get(),
                                           matrix->column_indices->data().get(),
                                           matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                           CUSPARSE_ACTION_NUMERIC,indexBase,
                                           cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
      err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err);
#endif

      stat = cusparse_csr2csc(cusparsestruct->handle,
                              A->rmap->n,A->cmap->n,matrix->num_entries,
                              csr2csc_a.data().get(),cusparsestruct->rowoffsets_gpu->data().get(),matrix->column_indices->data().get(),
                              matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                              matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                              CUSPARSE_ACTION_NUMERIC,indexBase,
                              cusparsestruct->csr2cscAlg, csr2cscBuffer
#else
                              matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase
#endif
                              );CHKERRCUSPARSE(stat);
      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      err = cudaFree(csr2cscBuffer);CHKERRCUDA(err);
#endif
    }
    /* gather the CSR values through the cached permutation into the transpose */
    PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
                                      thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
                                      matrixT->values->begin()));
  }
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  cusparsestruct->transupdated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

/* Why do we need to analyze the transposed matrix again?
   Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/* Solve A^T x = b using the cached transposed triangular factors.
   For A = L*U we have A^T = U^T * L^T, hence the U-solve precedes the L-solve below.
   Row/column permutations from the factorization are applied with thrust permutation
   iterators before/after the two triangular solves. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly (lazily, on first transpose solve) */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder b with the row permutation, placing the result in x (used as scratch here) */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U (input: xarray, output: tempGPU) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get()
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Then, solve L (input: tempGPU, output: xarray) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Same as MatSolveTranspose_SeqAIJCUSPARSE but for a factorization done in natural
   ordering: no row/column permutations are needed, so b feeds the U-solve directly. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly (lazily, on first transpose solve) */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U (input: barray, output: tempGPU) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get()
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Then, solve L (input: tempGPU, output: xarray) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Solve A x = b with the (non-transposed) triangular factors: permute b, solve L then U,
   then apply the column permutation. Unlike the transpose variant above, no on-the-fly
   analysis is needed here: the factors were built by the numeric factorization. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder b with the row permutation into the work vector.
     Note: the end of the range is defined by the index iterator (rpermIndices->end());
     the element iterator bGPU is the fixed base of the permutation in both arguments. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L (input: tempGPU, output: xarray) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Then, solve U (input: xarray, output: tempGPU) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get()
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Last, reorder with the column permutation into the output x */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Natural-ordering variant of MatSolve_SeqAIJCUSPARSE: no permutations, just
   L-solve followed by U-solve through the work vector. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L (input: barray, output: tempGPU) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get()
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Next, solve U (input: tempGPU, output: xarray) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Mirror the GPU-resident values back to the host CSR arrays when the GPU holds the
   only up-to-date copy; afterwards both copies are valid (PETSC_OFFLOAD_BOTH).
   NOTE(review): the cast of cusp->mat->mat to CsrMatrix assumes CSR storage here --
   confirm non-CSR formats cannot reach this path with PETSC_OFFLOAD_GPU set. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cudaError_t        cerr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat;

    ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

/* Hand out the host values array. The caller gets a writable pointer, so the CPU copy
   is marked as the authoritative one (PETSC_OFFLOAD_CPU) after syncing from the GPU. */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  *array = a->a;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}

/* Push the host CSR matrix to the GPU, (re)building the cusparse structures when the
   nonzero pattern changed and copying only values when it did not. */
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED ||
      A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only: nonzero pattern unchanged */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR values");
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      matrix->values->assign(a->a, a->a+a->nz);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* values changed, so any cached transpose holds stale values (structure is kept) */
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
    } else {
      /* Nonzero pattern changed (or non-CSR format): tear down and rebuild everything */
      PetscInt nnz;
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* With compressed-row storage only the nonempty rows are represented */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR row data");
        if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR column data");
        /* no host values yet: upload structure only and do not mark the host copy valid */
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
        else nnz = a->nz;

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident 1/0 constants, required by CUSPARSE_POINTER_MODE_DEVICE below */
        err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                     mat->num_rows, mat->num_cols, mat->num_entries,
                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                     mat->values->data().get(),
                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* build a temporary CSR on the device, convert it to HYB, then free the CSR */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
                                  matstruct->descr, mat->values->data().get(),
                                  mat->row_offsets->data().get(),
                                  mat->column_indices->data().get(),
                                  hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

/* Thrust functor: accumulate, out += in, over a zipped (in, out) tuple */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/* Thrust functor: assign, out = in, over a zipped (in, out) tuple */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/* Thrust functor: assign in the reverse direction, first = second, over a zipped tuple */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};

/* Per-product scratch data attached to C->product->data for sparse*dense products */
struct MatMatCusparse {
  PetscBool      cisdense;  /* C entered symbolic phase as MATSEQDENSE (CPU); convert back after numeric */
  PetscScalar    *Bt;       /* explicit B^T buffer, used only for CUDA < 11 where csrmm cannot transpose B */
  Mat            X;         /* intermediate dense result for PtAP / RARt */
  PetscBool      reusesym;  /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;
  CsrMatrix      *Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t matSpBDescr;
  PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;
  cusparseDnMatDescr_t matCDescr;
  PetscInt             Blda,Clda;   /* Record leading dimensions of B and C here to detect changes */
  size_t               mmBufferSize;
  void                 *mmBuffer;
  void                 *mmBuffer2;  /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};

/* Destructor for MatMatCusparse: releases all device buffers and cusparse descriptors,
   then the intermediate matrix and the struct itself. cudaFree(NULL) is a no-op, so
   mmdata->Bt needs no guard. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  PetscErrorCode ierr;
  MatMatCusparse *mmdata = (MatMatCusparse *)data;
  cudaError_t    cerr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
#endif

  PetscFunctionBegin;
  cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
  delete mmdata->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->mmBuffer)    { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); }
  if (mmdata->mmBuffer2)   { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); }
  if (mmdata->matBDescr)   { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->matCDescr)   { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->spgemmDesc)  { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); }
#endif
  ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);

/* Numeric phase of C = op(A)*op(B) with A sparse (SEQAIJCUSPARSE) and B dense.
   AB/AtB/ABt are computed directly; PtAP and RARt first compute X = A*op(B) here and
   then finish with a dense-dense product at the bottom of the function. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* pick the operand (A or its cached transpose), the cusparse op, and the result sizes */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!cusp->transgen) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else { /* use the explicitly stored transpose instead of op = TRANSPOSE */
      ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* PtAP/RARt write the sparse product into the intermediate X, not into C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat        = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat        = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
    /* grow the work buffer only when the required size exceeds the cached one */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
      cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
#else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B, so transpose B explicitly with cublas geam */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr; /* note: intentionally shadows the outer cudaError_t cerr */

    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ?
mmdata->Bt : barray,blda,mat->beta_zero,
                             carray,clda);CHKERRCUSPARSE(stat);
#endif
  /* block until the asynchronous cuSPARSE work is done before logging and releasing the arrays */
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* one multiply and one add per stored entry of A, for each of the n columns of op(B) */
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  if (product->type == MATPRODUCT_RARt) {
    /* carray pointed at the intermediate mmdata->X: finish the product with a dense-dense multiply */
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  if (mmdata->cisdense) { /* the user's C was plain MATSEQDENSE: convert the result back in place */
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) { /* B was not CUDA dense on entry: restore it to MATSEQDENSE */
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Symbolic phase for products of a MATSEQAIJCUSPARSE A with a dense B: validates A
   (CSR storage only), computes the result dimensions for the requested product type,
   turns C into a MATSEQDENSECUDA matrix, and allocates the MatMatCusparse product data
   used by the numeric phase (a B^T buffer for CUDA < 11, and an intermediate dense X
   for MATPRODUCT_PtAP/MATPRODUCT_RARt). */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  /* result dimensions depend on the product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense;
#if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}

/* Numeric phase for sparse C = op(A) op(B) with A, B, C all MATSEQAIJCUSPARSE.
   Reuses the descriptors and buffers prepared by the matching symbolic phase; on
   CUDA >= 11 it re-runs cusparseSpGEMM_compute/copy, otherwise the legacy csrgemm path. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    if (!Ccsr)
SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize; /* empty product: nothing to compute, only finalize the assembly bookkeeping */
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);

  /* for symmetric matrices a transpose product can reuse the non-transposed storage */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
  if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  /* recompute the values of C reusing the spgemm descriptor and work buffer from the symbolic phase */
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
  ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled =
PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}

/* Symbolic phase for sparse C = op(A) op(B) with A and B both MATSEQAIJCUSPARSE:
   computes the sparsity pattern of C on the GPU (cusparseSpGEMM on CUDA >= 11,
   csrgemm otherwise), allocates C's CSR storage, copies the pattern back to the
   host, and keeps the descriptors/buffers needed by the numeric phase in the
   MatMatCusparse product data attached to C->product. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed,ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
  size_t                       bufSize2;
  cusparseSpMatDescr_t         BmatSpDescr;
#else
  int                          cnz;
#endif

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
  /* for symmetric matrices a transpose product can reuse the non-transposed storage */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  switch (ptype) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    k = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr);
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  c = (Mat_SeqAIJ*)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  Cmat = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, then c will be in compressed row format */
    c->compressedrow.nrows = a->compressedrow.nrows;
    ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
    ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat      = Cmat;
  Ccusp->mat->mat = Ccsr;
  Ccsr->num_rows  = Ccusp->nrows;
  Ccsr->num_cols  = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
  stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
  /* device-resident copies of the scalar constants used with CUSPARSE_POINTER_MODE_DEVICE */
  cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
    c->nz = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix*)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
    Bcsr = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
      ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr = Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
                               Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                               Bcsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
  /* precompute flops count */
  if (ptype == MATPRODUCT_AB) {
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i+1];
      for (j=st; j<en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2.*(b->i[brow+1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt anzi = a->i[i+1] - a->i[i];
      const PetscInt bnzi = b->i[i+1] - b->i[i];
      flops += (2.*anzi)*bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* C starts with zero entries; the actual pattern/values pointers are set after SpGEMM sizes C */
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
                           NULL, NULL, NULL,
                           CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                           CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
  c->nz = (PetscInt) C_nnz1;
  ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr);
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
  stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
  c->nz = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalizesym:
  /* mirror the device CSR pattern on the host and fill in the Mat_SeqAIJ bookkeeping */
  c->singlemalloc = PETSC_FALSE;
  c->free_a       = PETSC_TRUE;
  c->free_ij      = PETSC_TRUE;
  ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
    PetscInt *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii = *Ccsr->row_offsets;
    jj = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  if (ciscompressed) { /* need to expand host row offsets */
    PetscInt r = 0;
    c->i[0] = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r+1] = old;
    }
    for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
  c->maxnz         = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax          = 0;
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k+1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt)!!nn;
    c->rmax = PetscMax(c->rmax,nn);
  }
  ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated  = PETSC_TRUE;
  C->assembled     = PETSC_FALSE;
  C->was_assembled = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/* handles sparse or dense B */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
PetscErrorCode ierr;
  PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr);
  /* only consider the GPU sparse-sparse path when neither operand is pinned to the CPU */
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
  }
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr);
    }
  }
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) { /* A stays on the host: defer to the CPU AIJ-times-dense implementation */
        ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      /* no dedicated CUSPARSE kernel for these: compose them from the basic products */
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* yy = A xx: forwarded to the common mult-add kernel (no transpose, no addend) */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* zz = A xx + yy */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* yy = A^H xx (conjugate transpose) */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* zz = A^H xx + yy */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* yy = A^T xx */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* y[idx[i]] += x[i] for 0 <= i < n. One thread per entry of x; expects a 1D launch
   covering at least n threads (extra threads exit through the bounds guard).
   NOTE(review): the += is unsynchronized, so entries of idx are presumably unique
   within a launch; duplicates would be a data race -- confirm with the callers. */
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}

/* z = op(A) x + y.
   If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny; /* lengths of x and y in y = op(A) x; only assigned (and used) on the CSR-format path */
#endif

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) { /* empty matrix: product is zero, only y (if given) contributes */
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !cusparsestruct->transgen) {
      /* apply op to the untransposed storage: ^H always, or ^T when no explicit transpose is kept */
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      /* use an explicitly stored transpose, generated lazily on first use */
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
         allocated to accommodate different uses. So we get the length info directly from mat.
      */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
      */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA indexes the per-operation cuSpMV[] cache below, so guard against enum changes */
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                       matstruct->matDescr,
                                       matstruct->cuSpMV[opA].vecXDescr, beta,
                                       matstruct->cuSpMV[opA].vecYDescr,
                                       cusparse_scalartype,
                                       cusparsestruct->spmvAlg,
                                       &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                          matstruct->alpha_one,
                          matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */
                          matstruct->cuSpMV[opA].vecXDescr,
                          beta,
                          matstruct->cuSpMV[opA].vecYDescr,
                          cusparse_scalartype,
                          cusparsestruct->spmvAlg,
                          matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
#else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
#endif
      }
    }
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
        */
#if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
#else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
#endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    /* NOTE(review): thrust exceptions derive from std::exception; a char* handler is the
       long-standing convention in this file — confirm it matches what is actually thrown */
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* zz = A^T xx + yy */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Host assembly-end, plus offload-mask bookkeeping when a split-CSR device matrix was assembled on the GPU */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode             ierr;
  PetscSplitCSRDataStructure *d_mat = NULL;
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    /* NOTE(review): assumes A->spptr is non-NULL for unfactored matrices — confirm all callers guarantee this */
    d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
  }
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it?
  if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0);
  if (d_mat) {
    /* values were assembled directly into the device matrix, so the GPU copy is the authoritative one */
    A->offloadmask = PETSC_OFFLOAD_GPU;
  }

  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz).  By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.
n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Destroy the GPU-side data (plain or factored), free any split-CSR device matrix,
   clear the composed function pointers, then dispatch to the host SeqAIJ destroy */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode             ierr;
  PetscSplitCSRDataStructure *d_mat = NULL;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    /* NOTE(review): assumes A->spptr is non-NULL here — confirm it cannot be destroyed before conversion completes */
    d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
    ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL;
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  if (d_mat) {
    Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
    cudaError_t                err;
    PetscSplitCSRDataStructure h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* copy the device-side struct back so we can free the device arrays it points to */
    err  = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    if (a->compressedrow.use) {
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate on the host, then convert the copy in place to seqaijcusparse */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Y = Y + a*X on the GPU. Upgrades str to SAME_NONZERO_PATTERN when the two CSR
   structures are verified identical, enabling a plain cuBLAS axpy on the values. */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;
  cudaError_t        cerr;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    /* mixed CPU/GPU binding: fall back to the host implementation */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
  ierr =
MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
  if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
  if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy: equal nnz plus identical row offsets and column indices */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    /* Y = 1.0*Y + a*X via cusparse csrgeam, writing the result in place into Y's arrays
       (host pointer mode is needed because &a and &b are host addresses) */
    cusparseStatus_t stat;
    PetscScalar      b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t           bufferSize;
    void             *buffer;
#endif

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                          cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                               cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    cerr = cudaFree(buffer);CHKERRCUDA(cerr);
#else
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                               cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#endif
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical structure: the update is a dense axpy on the nonzero value arrays */
    cublasHandle_t cublasv2handle;
    cublasStatus_t berr;
    PetscBLASInt   one = 1, bnz = 1;

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else {
    /* different pattern: fall back to the host implementation */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Y = a*Y: dense cuBLAS scal on the nonzero value array */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *y = (Mat_SeqAIJ*)Y->data;
  PetscScalar    *ay;
  cudaError_t    cerr;
  cublasHandle_t cublasv2handle;
  cublasStatus_t berr;
  PetscBLASInt   one = 1, bnz = 1;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(y->nz,&bnz);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  berr = cublasXscal(cublasv2handle,bnz,&a,ay,one);CHKERRCUBLAS(berr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuFlops(bnz);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Zero both the device values (matrix and its cached transpose, if present) and the host values */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  PetscBool      both = PETSC_FALSE;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (spptr->mat) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE; /* device values exist and are zeroed too, so GPU and CPU copies agree */
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
if (spptr->matTranspose) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
      if (matrix->values) {
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
  }
  //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
  ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;

  PetscFunctionReturn(0);
}

/* Switch the matrix between the host (SeqAIJ) and device (SeqAIJCUSPARSE) implementations
   of its operations; flg == PETSC_TRUE binds to the CPU. Factored matrices are left alone. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  if (flg) {
    /* ensure the host copy is current before handing control to the CPU kernels */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  a->inode.use  = flg; /* inodes are a CPU optimization; disabled while on the GPU */
  PetscFunctionReturn(0);
}

/* Convert a SeqAIJ matrix to SeqAIJCUSPARSE: attach the GPU side struct (plain or
   tri-factor flavor depending on factortype), install the CUSPARSE ops table, and
   set the default vector type to CUDA. Handles MAT_INITIAL_MATRIX, MAT_REUSE_MATRIX
   and MAT_INPLACE_MATRIX. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat;

  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      spptr->format = MAT_CUSPARSE_CSR;
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
      spptr->deviceMat = NULL;
    } else {
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type constructor: create a SeqAIJ matrix and convert it in place, then process options */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr);
  ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3372 3373 Level: beginner 3374 3375 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 3376 M*/ 3377 3378 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*); 3379 3380 3381 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 3382 { 3383 PetscErrorCode ierr; 3384 3385 PetscFunctionBegin; 3386 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3387 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3388 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3389 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3390 PetscFunctionReturn(0); 3391 } 3392 3393 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 3394 { 3395 PetscErrorCode ierr; 3396 cusparseStatus_t stat; 3397 3398 PetscFunctionBegin; 3399 if (*cusparsestruct) { 3400 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr); 3401 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr); 3402 delete (*cusparsestruct)->workVector; 3403 delete (*cusparsestruct)->rowoffsets_gpu; 3404 delete (*cusparsestruct)->cooPerm; 3405 delete (*cusparsestruct)->cooPerm_a; 3406 delete (*cusparsestruct)->csr2csc_i; 3407 if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);} 3408 ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr); 3409 } 3410 PetscFunctionReturn(0); 3411 } 3412 3413 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 3414 { 3415 
PetscFunctionBegin;
  if (*mat) {
    /* thrust device vectors; delete of NULL members is a no-op */
    delete (*mat)->values;
    delete (*mat)->column_indices;
    delete (*mat)->row_offsets;
    delete *mat;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}

/* Frees one triangular-factor struct: cusparse descriptor, analysis info,
   the CSR storage, and the device/host scratch buffers used by the solves. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    if ((*trifactor)->solveBuffer) {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
    if ((*trifactor)->AA_h) {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
#endif
    ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Frees a mult struct in the given storage format: the wrapped CSR (or HYB) matrix,
   the cusparse descriptors, the device-resident scalar constants, and (CUDA >= 11)
   the per-operation SpMV descriptors/buffers. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
#endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        /* check the error code instead of silently dropping it (was an unchecked call) */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}

/* Resets (frees) the four triangular-factor structs and the permutation/work arrays,
   but keeps the container and its cusparse handle alive for reuse. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector = NULL;
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode
MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 3511 { 3512 PetscErrorCode ierr; 3513 cusparseHandle_t handle; 3514 cusparseStatus_t stat; 3515 3516 PetscFunctionBegin; 3517 if (*trifactors) { 3518 ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr); 3519 if (handle = (*trifactors)->handle) { 3520 stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 3521 } 3522 ierr = PetscFree(*trifactors);CHKERRQ(ierr); 3523 } 3524 PetscFunctionReturn(0); 3525 } 3526 3527 struct IJCompare 3528 { 3529 __host__ __device__ 3530 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3531 { 3532 if (t1.get<0>() < t2.get<0>()) return true; 3533 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 3534 return false; 3535 } 3536 }; 3537 3538 struct IJEqual 3539 { 3540 __host__ __device__ 3541 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3542 { 3543 if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false; 3544 return true; 3545 } 3546 }; 3547 3548 struct IJDiff 3549 { 3550 __host__ __device__ 3551 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 3552 { 3553 return t1 == t2 ? 
0 : 1;
  }
};

/* Logical OR of two 0/1 flags (transform op combining the row/column change markers). */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return t1||t2;
  }
};

#include <thrust/iterator/discard_iterator.h>
/* Sets the numerical values of a matrix previously preallocated with
   MatSetPreallocationCOO_SeqAIJCUSPARSE. v is in the original (unsorted) COO
   ordering and may live on either host or device; cooPerm maps it into the CSR
   slots, and cooPerm_a (if present) groups duplicate entries for reduction. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *cooPerm_v = NULL; /* device copy of v when v is host memory */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *matrix;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;
  PetscInt                              n;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO preallocation recorded: fall back to a plain assembly */
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    /* NULL values with INSERT_VALUES means zero the matrix */
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) {
      /* duplicates present: reduce permuted values by key into a scratch array, then add */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* unique entries: add permuted values straight into the CSR value array */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals());
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    }
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
  ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr);
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}

/* Marks the cached transpose as stale; when destroy is true, frees it (and the
   csr2csc index map) outright instead of just clearing the updated flag. */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (!cusp) PetscFunctionReturn(0);
  if (destroy) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
    delete cusp->csr2csc_i;
    cusp->csr2csc_i = NULL;
  }
  cusp->transupdated = PETSC_FALSE;
  PetscFunctionReturn(0);
}

#include <thrust/binary_search.h>
/* Builds the CSR structure of A from n COO entries (coo_i,coo_j) on the GPU and
   records the permutation (cooPerm) and duplicate grouping (cooPerm_a) that
   MatSetValuesCOO_SeqAIJCUSPARSE later uses to scatter/reduce values. */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0;
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  cooPerm_n = cusp->cooPerm ?
cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    /* size changed: drop the cached permutation arrays so they get rebuilt */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);            /* device copies of the COO row/col indices */
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);    /* per-row upper bounds, becomes a->i[1..] */

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* sort (i,j) lexicographically, dragging the identity permutation along in cooPerm */
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());
    *cusp->cooPerm_a = d_i;
    THRUSTINTARRAY w = d_j;

    auto nekey = thrust::unique(fkey, ekey, IJEqual());
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* I couldn't come up with a more elegant algorithm */
      /* build, per sorted entry, a group id that increases whenever (i,j) changes:
         mark row changes, mark column changes, OR them, then prefix-sum the marks */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());
      (*cusp->cooPerm_a)[0] = 0;
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
    }
    /* count entries per row via binary search over the (deduplicated) sorted rows */
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(),
                        search_begin, search_begin + A->rmap->n,
                        ii.begin());
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* rebuild the host-side CSR arrays of the SeqAIJ part from the GPU result */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0;
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);

  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}

/* Returns a read-only device pointer to the CSR value array; ensures the GPU
   copy is up to date first. Does not change the offload state. */
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  PetscFunctionReturn(0);
}

/* Releases a pointer obtained with MatSeqAIJCUSPARSEGetArrayRead (just NULLs it). */
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  *a = NULL;
  PetscFunctionReturn(0);
}

/* Returns a read-write device pointer to the CSR value array; syncs to the GPU,
   marks the GPU copy authoritative, and invalidates the cached transpose. */
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Releases a read-write pointer from MatSeqAIJCUSPARSEGetArray and bumps the
   object state since the values may have been modified. */
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL;
  PetscFunctionReturn(0);
}

/* Returns a write-only device pointer to the CSR value array. Unlike GetArray,
   the current values are NOT copied to the GPU first (caller overwrites them all). */
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Releases a write-only pointer from MatSeqAIJCUSPARSEGetArrayWrite. */
PetscErrorCode
MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL;
  PetscFunctionReturn(0);
}

/* Lexicographic (row, column) ordering on (coo_row, col, value, tag) 4-tuples;
   used by the merge in MatSeqAIJCUSPARSEMergeMats. */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    if (t1.get<0>() < t2.get<0>()) return true;
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
    return false;
  }
};

/* Adds a fixed offset to an index (used to shift B's column indices by A's width). */
struct Shift
{
  int _shift;

  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &c)
  {
    return c + _shift;
  }
};

/* merges two SeqAIJCUSPARSE matrices, [A';B']' operation in matlab notation */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     Annz,Bnnz;
  cusparseStatus_t             stat;
  PetscInt                     i,m,n,zero = 0;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  /* NOTE(review): error message reads "number or rows" — likely meant "number of rows"; left unchanged here since it is a runtime string */
  if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n);
  if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* create C = [A B] with m rows and A->cols + B->cols columns, directly on the GPU */
    m     = A->rmap->n;
    n     = A->cmap->n + B->cmap->n;
    ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
    ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
    ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    c     = (Mat_SeqAIJ*)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
    Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr  = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows    = m;
    Ccusp->mat      = Cmat;
    Ccusp->mat->mat = Ccsr;
    Ccsr->num_rows  = m;
    Ccsr->num_cols  = n;
    /* standard general, 0-based cusparse descriptor plus device-resident scalar constants */
    stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr);
    if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr = (CsrMatrix*)Acusp->mat->mat;
    Bcsr = (CsrMatrix*)Bcusp->mat->mat;
    Annz = (PetscInt)Acsr->column_indices->size();
    Bnnz = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    Ccsr->num_entries = c->nz;
    /* cooPerm records where each of A's and B's entries landed in C for fast reuse */
    Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      auto Acoo = new THRUSTINTARRAY32(Annz);
      auto Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff,*Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
          ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
          ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      /* expand both CSRs to COO row indices so the rows can be merged */
      stat = cusparseXcsr2coo(Acusp->handle,
                              Aroff->data().get(),
                              Annz,
                              m,
                              Acoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle,
                              Broff->data().get(),
                              Bnnz,
                              m,
                              Bcoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
#endif
      /* merge (row, shifted-col, value, origin-tag) streams of A and B into C */
      auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
      auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
      auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
      auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
      auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
      auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
      auto p1 = Ccusp->cooPerm->begin();
      auto p2 = Ccusp->cooPerm->begin();
      thrust::advance(p2,Annz);
      PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
      /* undo the in-place shift applied to B's column indices above */
      thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
#endif
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      /* split destination positions by origin tag: A's targets into p1, B's into p2 */
      auto pred = thrust::identity<int>();
      PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
      PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
#endif
      stat = cusparseXcoo2csr(Ccusp->handle,
                              Ccoo->data().get(),
                              c->nz,
                              m,
                              Ccsr->row_offsets->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
                               Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
      if (Acusp->transgen && Bcusp->transgen) { /* if A and B have the transpose, generate C transpose too */
        /* C^T = [A^T; B^T] is a simple vertical stack: row offsets of B^T are shifted by A's nnz */
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix *CcsrT = new CsrMatrix;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

        Ccusp->transgen = PETSC_TRUE;
        Ccusp->transupdated = PETSC_TRUE;
        Ccusp->rowoffsets_gpu = NULL;
        CmatT->cprowIndices = NULL;
        CmatT->mat = CcsrT;
        CcsrT->num_rows = n;
        CcsrT->num_cols = m;
        CcsrT->num_entries = c->nz;

        CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values = new THRUSTARRAY(c->nz);

        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
          /* step back one so B^T's first offset overwrites A^T's trailing nnz entry */
          thrust::advance(rT,-1);
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
          thrust::copy(titb,tite,rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

        stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
4086 cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4087 cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4088 cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4089 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4090 stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, 4091 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), 4092 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 4093 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 4094 #endif 4095 Ccusp->matTranspose = CmatT; 4096 } 4097 } 4098 4099 c->singlemalloc = PETSC_FALSE; 4100 c->free_a = PETSC_TRUE; 4101 c->free_ij = PETSC_TRUE; 4102 ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 4103 ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 4104 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 4105 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 4106 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 4107 ii = *Ccsr->row_offsets; 4108 jj = *Ccsr->column_indices; 4109 cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4110 cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4111 } else { 4112 cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4113 cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4114 } 4115 ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 4116 ierr = 
PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 4117 ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 4118 c->maxnz = c->nz; 4119 c->nonzerorowcnt = 0; 4120 c->rmax = 0; 4121 for (i = 0; i < m; i++) { 4122 const PetscInt nn = c->i[i+1] - c->i[i]; 4123 c->ilen[i] = c->imax[i] = nn; 4124 c->nonzerorowcnt += (PetscInt)!!nn; 4125 c->rmax = PetscMax(c->rmax,nn); 4126 } 4127 ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr); 4128 ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 4129 (*C)->nonzerostate++; 4130 ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr); 4131 ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr); 4132 Ccusp->nonzerostate = (*C)->nonzerostate; 4133 (*C)->preallocated = PETSC_TRUE; 4134 } else { 4135 if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n); 4136 c = (Mat_SeqAIJ*)(*C)->data; 4137 if (c->nz) { 4138 Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 4139 if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm"); 4140 if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4141 if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate"); 4142 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4143 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 4144 if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4145 if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4146 Acsr = (CsrMatrix*)Acusp->mat->mat; 4147 Bcsr = (CsrMatrix*)Bcusp->mat->mat; 4148 Ccsr = (CsrMatrix*)Ccusp->mat->mat; 4149 if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size()); 4150 if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D 
!= %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size()); 4151 if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size()); 4152 if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries); 4153 if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size()); 4154 auto pmid = Ccusp->cooPerm->begin(); 4155 thrust::advance(pmid,Acsr->num_entries); 4156 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4157 auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), 4158 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin()))); 4159 auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), 4160 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4161 thrust::for_each(zibait,zieait,VecCUDAEquals()); 4162 auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), 4163 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4164 auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), 4165 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end()))); 4166 thrust::for_each(zibbit,ziebit,VecCUDAEquals()); 4167 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr); 4168 if (Acusp->transgen && Bcusp->transgen && Ccusp->transgen) { 4169 if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct"); 4170 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4171 CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4172 CsrMatrix *BcsrT = BT ? 
(CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        Ccusp->transupdated = PETSC_TRUE;
      }
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    }
  }
  ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJCopySubArray_SeqAIJCUSPARSE - copy a subset of the (GPU-resident)
   nonzero values of A into the caller-provided array v.

   If idx is non-NULL, performs the gather v[i] = a[idx[i]], i = 0..n-1, on the
   device; otherwise the first n values are copied contiguously.  v may point
   to either host or device memory (probed with isCudaMem()); the copy
   direction of the final cudaMemcpy is chosen accordingly, and when v is host
   memory the device->host traffic is logged.

   Input Parameters:
+  A   - the SEQAIJCUSPARSE matrix
.  n   - number of values to copy
-  idx - indices into the values array, or NULL for a contiguous copy

   Output Parameter:
.  v   - array of length at least n receiving the values (host or device)
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;
  const PetscScalar *av;
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v); /* does v live in device memory? */
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n); /* upload the index set to the device */
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    THRUSTARRAY *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      /* device scratch buffer to gather into before the device->host copy */
      w  = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* gather on the device: dv[i] = dav[widx[i]] */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) {
      cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    delete w; /* deleting NULL is a no-op */
  } else {
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  /* When v is host memory the n scalars moved GPU -> CPU above; log the
     transfer in that direction (previously mislogged as CpuToGpu). */
  if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); }
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}